diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Main.hs | 142 | ||||
-rw-r--r-- | src/wrappers/checkin.sh | 7 | ||||
-rw-r--r-- | src/wrappers/common.sh | 27 | ||||
-rw-r--r-- | src/wrappers/getopts.sh | 12 | ||||
-rw-r--r-- | src/wrappers/html2markdown.in | 134 | ||||
-rw-r--r-- | src/wrappers/latex2markdown.in | 14 | ||||
-rw-r--r-- | src/wrappers/markdown2html.in | 12 | ||||
-rw-r--r-- | src/wrappers/markdown2latex.in | 12 | ||||
-rw-r--r-- | src/wrappers/markdown2pdf.in | 68 | ||||
-rw-r--r-- | src/wrappers/postopts.sh | 17 | ||||
-rw-r--r-- | src/wrappers/singlearg.sh | 7 | ||||
-rw-r--r-- | src/wrappers/testwrapper.in | 141 | ||||
-rw-r--r-- | src/wrappers/web2markdown.in | 173 |
13 files changed, 318 insertions, 448 deletions
diff --git a/src/Main.hs b/src/Main.hs index 542e521f6..0f8567517 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -45,6 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader, defaultRTFHeader, defaultS5Header, defaultLaTeXHeader ) import Text.Pandoc.Definition import Text.Pandoc.Shared +import Text.Regex ( mkRegex, splitRegex ) import System ( exitWith, getArgs, getProgName ) import System.Exit import System.Console.GetOpt @@ -57,6 +58,9 @@ import Control.Monad ( (>>=) ) version :: String version = "0.3" +copyrightMessage :: String +copyrightMessage = "\nCopyright (C) 2006 John MacFarlane\nWeb: http://sophos.berkeley.edu/macfarlane/pandoc\nThis is free software; see the source for copying conditions. There is no\nwarranty, not even for merchantability or fitness for a particular purpose." + -- | Association list of formats and readers. readers :: [(String, ParserState -> String -> Pandoc)] readers = [("native" , readPandoc) @@ -101,10 +105,13 @@ data Opt = Opt , optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT" , optDefaultHeader :: String -- ^ Default header , optTitlePrefix :: String -- ^ Optional prefix for HTML title + , optOutputFile :: String -- ^ Name of output file , optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX , optIncremental :: Bool -- ^ If @True@, incremental lists in S5 , optSmart :: Bool -- ^ If @True@, use smart typography , optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML + , optShowUsage :: Bool -- ^ If @True@, show usage message + , optDebug :: Bool -- ^ If @True@, output debug messages } -- | Defaults for command-line options. @@ -123,32 +130,20 @@ startOpt = Opt , optCustomHeader = "DEFAULT" , optDefaultHeader = defaultHtmlHeader , optTitlePrefix = "" + , optOutputFile = "" -- null for stdout , optNumberSections = False , optIncremental = False , optSmart = False , optASCIIMathML = False + , optShowUsage = False + , optDebug = False } -- | A list of functions, each transforming the options data structure in response -- to a command-line option. -options :: [OptDescr (Opt -> IO Opt)] -options = - [ Option "v" ["version"] - (NoArg - (\_ -> do - hPutStrLn stderr ("Version " ++ version) - exitWith ExitSuccess)) - "Print version" - - , Option "h" ["help"] - (NoArg - (\_ -> do - prg <- getProgName - hPutStrLn stderr (usageInfo (prg ++ " [OPTIONS] [FILES] - convert FILES from one markup format to another\nIf no OPTIONS specified, converts from markdown to html.\nIf no FILES specified, input is read from STDIN.\nOptions:") options) - exitWith ExitSuccess)) - "Show help" - - , Option "fr" ["from","read"] +allOptions :: [OptDescr (Opt -> IO Opt)] +allOptions = + [ Option "fr" ["from","read"] (ReqArg (\arg opt -> case (lookup (map toLower arg) readers) of Just reader -> return opt { optReader = reader } @@ -172,6 +167,13 @@ options = (\opt -> return opt { optStandalone = True })) "Include needed header and footer on output" + , Option "o" ["output"] + (ReqArg + (\arg opt -> do + return opt { optOutputFile = arg }) + "FILENAME") + "Name of output file" + , Option "p" ["preserve-tabs"] (NoArg (\opt -> return opt { optPreserveTabs = True })) @@ -241,7 +243,7 @@ options = "FILENAME") "File to include after document body" - , Option "" ["custom-header"] + , Option "C" ["custom-header"] (ReqArg (\arg opt -> do text <- readFile arg @@ -263,18 +265,87 @@ options = let header = case (lookup arg writers) of Just (writer, head) -> head Nothing -> error ("Unknown reader: " ++ arg) - hPutStrLn stdout header + hPutStr stdout header exitWith ExitSuccess) "FORMAT") "Print default header for FORMAT" + + , Option "d" ["debug"] + (NoArg + (\opt -> return opt { optDebug = True })) + "Print debug messages to stderr, output to stdout" + + , Option "v" ["version"] + (NoArg + (\_ -> do + prg <- getProgName + hPutStrLn stderr (prg ++ " " ++ version ++ + copyrightMessage) + exitWith $ ExitFailure 2)) + "Print version" + + , Option "h" ["help"] + (NoArg + (\opt -> return opt { optShowUsage = True })) + "Show help" ] + +-- parse name of calling program and return default reader and writer descriptions +parseProgName name = + case (splitRegex (mkRegex "2") (map toLower name)) of + [from, to] -> (from, to) + _ -> ("markdown", "html") + +-- set default options based on reader and writer descriptions; start is starting options +setDefaultOpts from to start = + case ((lookup from readers), (lookup to writers)) of + (Just reader, Just (writer, header)) -> start {optReader = reader, + optWriter = writer, + optDefaultHeader = header} + _ -> start + +-- True if single-letter option is in option list +inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool +inOptList list desc = + let (Option letters _ _ _) = desc in + any (\x -> x `elem` list) letters + +-- Reformat usage message so it doesn't wrap illegibly +reformatUsageInfo = gsub " *--" " --" . + gsub "(-[A-Za-z0-9]) *--" "\\1, --" . + gsub " *([^- ])" "\n\t\\1" + main = do + name <- getProgName + let (from, to) = parseProgName name + + let irrelevantOptions = if not ('2' `elem` name) + then "" + else "frtwD" ++ + (if (to /= "html" && to /= "s5") then "SmcT" else "") ++ + (if (to /= "latex") then "N" else "") ++ + (if (to /= "s5") then "i" else "") ++ + (if (from /= "html" && from /= "latex") then "R" else "") + + let options = filter (not . inOptList irrelevantOptions) allOptions + + let defaultOpts = setDefaultOpts from to startOpt + args <- getArgs - let (actions, sources, errors) = getOpt RequireOrder options args + let (actions, sources, errors) = getOpt Permute options args + + if (not (null errors)) + then do + mapM (\e -> hPutStrLn stderr e) errors + hPutStrLn stderr (reformatUsageInfo $ + usageInfo (name ++ " [OPTIONS] [FILES]") options) + exitWith $ ExitFailure 2 + else + return () -- thread option data structure through all supplied option actions - opts <- foldl (>>=) (return startOpt) actions + opts <- foldl (>>=) (return defaultOpts) actions let Opt { optPreserveTabs = preserveTabs , optTabStop = tabStop @@ -289,12 +360,31 @@ main = do , optCustomHeader = customHeader , optDefaultHeader = defaultHeader , optTitlePrefix = titlePrefix + , optOutputFile = outputFile , optNumberSections = numberSections , optIncremental = incremental , optSmart = smart , optASCIIMathML = asciiMathML + , optShowUsage = showUsage + , optDebug = debug } = opts + if showUsage + then do + hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options) + exitWith $ ExitFailure 2 + else return () + + output <- if ((null outputFile) || debug) + then return stdout + else openFile outputFile WriteMode + + if debug + then do + hPutStrLn stderr ("OUTPUT=" ++ outputFile) + hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources + else return () + let writingS5 = (defaultHeader == defaultS5Header) let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop) let addBlank str = str ++ "\n\n" @@ -323,13 +413,13 @@ main = do writerIncludeBefore = includeBefore, writerIncludeAfter = includeAfter } - (readSources sources) >>= (putStr . encodeUTF8 . (writer writerOptions) . + (readSources sources) >>= (hPutStr output . encodeUTF8 . + (writer writerOptions) . (reader startParserState) . filter . - decodeUTF8 . (joinWithSep "\n")) + decodeUTF8 . (joinWithSep "\n")) >> hClose output where readSources [] = mapM readSource ["-"] readSources sources = mapM readSource sources - readSource "-" = getContents + readSource "-" = getContents readSource source = readFile source - diff --git a/src/wrappers/checkin.sh b/src/wrappers/checkin.sh deleted file mode 100644 index c9c564a23..000000000 --- a/src/wrappers/checkin.sh +++ /dev/null @@ -1,7 +0,0 @@ -# Check if input files exist. -for f; do - if [ -n "$f" ] && ! [ -f "$f" ]; then - err "File '$f' not found." - exit 1 - fi -done diff --git a/src/wrappers/common.sh b/src/wrappers/common.sh index 99a83be50..3481affff 100644 --- a/src/wrappers/common.sh +++ b/src/wrappers/common.sh @@ -8,22 +8,6 @@ WRAPPEE_ARGS= err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; } errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; } -usage () { - synopsis="$@" - err "Usage: $THIS $synopsis" - err "See $THIS(1) man file for details." -} - -runpandoc () { - if [ -n "$WRAPPEE_ARGS" ]; then - # Unpack arguments that will be passed to pandoc. - oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPEE_ARGS "$@"; IFS="$oldifs" - case "$1" in --) shift;; esac # tolerate the existence of a leading '--' - fi - - pandoc "$@" -} - # Portable which(1). pathfind () { oldifs="$IFS"; IFS=':' @@ -37,17 +21,6 @@ pathfind () { return 1 } -HAVE_ICONV= -if pathfind iconv; then - HAVE_ICONV=1 - alias to_utf8='iconv -t utf-8' - alias from_utf8='iconv -f utf-8' -else - err "Warning: iconv not present. Assuming UTF-8 character encoding." - alias to_utf8='cat' - alias from_utf8='cat' -fi - for p in pandoc $REQUIRED; do pathfind $p || { err "You need '$p' to use this program!" diff --git a/src/wrappers/getopts.sh b/src/wrappers/getopts.sh deleted file mode 100644 index 263263c07..000000000 --- a/src/wrappers/getopts.sh +++ /dev/null @@ -1,12 +0,0 @@ -if [ -z "$SYNOPSIS" ]; then - SYNOPSIS="[-h] [input_file]" - [ -n "$THIS_NARG" ] || SYNOPSIS="${SYNOPSIS}..." -fi - -while getopts h opt; do - case $opt in - h|?) usage "$SYNOPSIS"; exit 2 ;; - esac -done - -shift $(($OPTIND - 1)) diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in deleted file mode 100644 index 0fece3ccd..000000000 --- a/src/wrappers/html2markdown.in +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/sh -e -# converts html to markdown -# uses an available program to fetch URL and tidy to normalize it first - -REQUIRED=tidy - -### common.sh - -grab_url_with () { - url="${1:?internal error: grab_url_with: url required}" - - shift - cmdline="$@" - - prog= - prog_opts= - if [ -n "$cmdline" ]; then - eval "set -- $cmdline" - prog=$1 - shift - prog_opts="$@" - fi - - if [ -z "$prog" ]; then - # Locate a sensible web grabber (note the order). - for p in wget lynx w3m curl links w3c; do - if pathfind $p; then - prog=$p - break - fi - done - - [ -n "$prog" ] || { - errn "$THIS: Couldn't find a program to fetch the file from URL " - err "(e.g. wget, w3m, lynx, w3c, or curl)." - return 1 - } - else - pathfind "$prog" || { - err "$THIS: No such web grabber '$prog' found; aborting." - return 1 - } - fi - - # Setup proper base options for known grabbers. - base_opts= - case "$prog" in - wget) base_opts="-O-" ;; - lynx) base_opts="-source" ;; - w3m) base_opts="-dump_source" ;; - curl) base_opts="" ;; - links) base_opts="-source" ;; - w3c) base_opts="-n -get" ;; - *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." - esac - - err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." - eval "set -- $base_opts $prog_opts" - $prog "$@" "$url" -} - -encoding= -grabber= -nograb= -while getopts e:g:nh opt; do - case $opt in - e) encoding="$OPTARG" ;; - g) grabber="$OPTARG" ;; - n) nograb=1 ;; - h|?) - usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]" - exit 2 ;; - esac -done - -shift $(($OPTIND - 1)) - -### postopts.sh - -### singlearg.sh - -inurl= -if [ -n "$1" ] && ! [ -f "$1" ]; then - if [ -n "$nograb" ]; then - err "'$1' not found; refusing to treat input as URL." - exit 1 - fi - # Treat given argument as an URL. - inurl="$1" -fi - -if [ -n "$inurl" ]; then - err "Attempting to fetch file from '$inurl'..." - - ### tempdir.sh - - grabber_out=$THIS_TEMPDIR/grabber.out - grabber_log=$THIS_TEMPDIR/grabber.log - if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \ - 2>$grabber_log; then - errn "grab_url_with failed" - if [ -f $grabber_log ]; then - err " with the following error log." - err - cat >&2 $grabber_log - else - err . - fi - exit 1 - fi - - set -- $grabber_out -fi - -if [ -z "$encoding" ] && [ "x$@" != "x" ]; then - # Try to determine character encoding unless not specified - # and input is STDIN. - encoding=$( - head "$@" | - LC_ALL=C tr 'A-Z' 'a-z' | - sed -ne '/<meta .*content-type.*charset=/ { - s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p - }' - ) -fi - -if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then - alias to_utf8='iconv -f "$encoding" -t utf-8' -elif [ -n "$inurl" ]; then # assume web pages are UTF-8 - alias to_utf8='cat' -fi # else just use local encoding - -to_utf8 "$@" | tidy -utf8 2>/dev/null | -runpandoc -r html -w markdown -s | from_utf8 diff --git a/src/wrappers/latex2markdown.in b/src/wrappers/latex2markdown.in deleted file mode 100644 index e8cde8a97..000000000 --- a/src/wrappers/latex2markdown.in +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -e -# runs pandoc to convert latex to markdown - -### common.sh - -### getopts.sh - -### postopts.sh - -### singlearg.sh - -### checkin.sh - -to_utf8 "$@" | runpandoc -r latex -w markdown -s | from_utf8 diff --git a/src/wrappers/markdown2html.in b/src/wrappers/markdown2html.in deleted file mode 100644 index e255398d2..000000000 --- a/src/wrappers/markdown2html.in +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -e -# converts markdown to HTML - -### common.sh - -### getopts.sh - -### postopts.sh - -### checkin.sh - -to_utf8 "$@" | runpandoc | from_utf8 diff --git a/src/wrappers/markdown2latex.in b/src/wrappers/markdown2latex.in deleted file mode 100644 index c532b2f99..000000000 --- a/src/wrappers/markdown2latex.in +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh -e -# converts markdown to latex - -### common.sh - -### getopts.sh - -### postopts.sh - -### checkin.sh - -to_utf8 "$@" | runpandoc -w latex -s | from_utf8 diff --git a/src/wrappers/markdown2pdf.in b/src/wrappers/markdown2pdf.in index 838767224..c222c1cbd 100644 --- a/src/wrappers/markdown2pdf.in +++ b/src/wrappers/markdown2pdf.in @@ -1,64 +1,54 @@ #!/bin/sh -e -# converts markdown to latex, then uses latex to make a PDF -REQUIRED=pdflatex +REQUIRED="markdown2latex pdflatex" ### common.sh -outfile= -while getopts o:h opt; do - case $opt in - o) outfile="$OPTARG" ;; - h|?) usage "[-o output_file] [-h] [input_file]..."; exit 2 ;; - esac -done - -shift $(($OPTIND - 1)) - -### postopts.sh +### tempdir.sh -### checkin.sh +texname=output +logfile=$THIS_TEMPDIR/log -if [ -z "$outfile" ]; then - if [ -n "$1" ]; then - outfile="${1%.*}" - else - outfile="stdin" # input is STDIN, since no argument given - fi +if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then + [ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \ + -e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2 + exit 1 fi -case "$outfile" in -*.*) ;; # skip appending extension if one is already present -*) outfile="${outfile%.*}.pdf";; -esac -### tempdir.sh - -# We should use a filename without white spaces for pdflatex. -TEXNAME=$THIS +outfile="$(sed -ne 's/^OUTPUT=//p' $logfile)" +IFS="$NEWLINE" +set -- $(sed -ne 's/^INPUT=//p' $logfile) +firstinfilebase="${1%.*}" +defaultdest="${firstinfilebase:-stdin}.pdf" +destname="${outfile:-$defaultdest}" -to_utf8 "$@" | runpandoc -w latex -s >$THIS_TEMPDIR/$TEXNAME.tex ( cd $THIS_TEMPDIR - if ! pdflatex -interaction=batchmode $TEXNAME.tex >/dev/null 2>&1; then + if ! pdflatex -interaction=batchmode $texname.tex >/dev/null 2>&1; then err "LaTeX errors:" - from_utf8 $TEXNAME.log | sed -ne '/^!/,/^ *$/p' >&2 - if grep -q "File \`ucs.sty' not found" $TEXNAME.log; then - err "Please install the 'unicode' package from ctan.org." + sed -ne '/^!/,/^ *$/p' $texname.log >&2 + if grep -q "File \`ucs.sty' not found" $texname.log; then + err "Please install the 'unicode' package from CTAN:" + err "http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/" + fi + if grep -q "File \`fancyvrb.sty' not found" $texname.log; then + err "Please install the 'fancyvrb' package from CTAN:" + err "http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/" fi exit 1 fi -) +) || exit $? is_target_exists= -if [ -f "$outfile" ]; then +if [ -f "$destname" ]; then is_target_exists=1 - mv -f "$outfile" "$outfile~" + mv "$destname" "$destname~" fi -mv -f $THIS_TEMPDIR/$TEXNAME.pdf "$outfile" +mv -f $THIS_TEMPDIR/$texname.pdf "$destname" -errn "Created '$outfile'" +errn "Created $destname" [ -z "$is_target_exists" ] || { - errn " (previous file has been backed up as '$outfile~')" + errn " (previous file has been backed up as $destname~)" } err . diff --git a/src/wrappers/postopts.sh b/src/wrappers/postopts.sh deleted file mode 100644 index e0d015f41..000000000 --- a/src/wrappers/postopts.sh +++ /dev/null @@ -1,17 +0,0 @@ -# Parse wrapper and wrappee (pandoc) arguments by taking -# into account that they may have space or tab characters. -pick="WRAPPER_ARGS" -while [ $# -gt 0 ]; do - if [ "$pick" = "WRAPPER_ARGS" ]; then - case "$1" in - -*) pick="WRAPPEE_ARGS" ;; - esac - fi - # Pack args with NEWLINE to preserve spaces, - # and put them into the picked variable. - eval "$pick=\"\$${pick}${NEWLINE}${1}\"" - shift -done - -# Unpack filename arguments. Now "$@" will hold the filenames. -oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPER_ARGS; IFS="$oldifs" diff --git a/src/wrappers/singlearg.sh b/src/wrappers/singlearg.sh deleted file mode 100644 index f742d1383..000000000 --- a/src/wrappers/singlearg.sh +++ /dev/null @@ -1,7 +0,0 @@ -# Ensure to work with a single argument. -if [ $# -gt 1 ]; then - first_arg="$1" - shift - err "Warning: extra arguments '$@' will be ignored." - set -- $first_arg -fi diff --git a/src/wrappers/testwrapper.in b/src/wrappers/testwrapper.in deleted file mode 100644 index e025c87e7..000000000 --- a/src/wrappers/testwrapper.in +++ /dev/null @@ -1,141 +0,0 @@ -#!/bin/sh - -THIS=$1 - -ASH="ash -s" -BASH="bash --posix -s" -DASH="dash -s" -KSH="ksh -s" -POSH="posh -s" -ZSH="zsh -s" - -ERROR="" - -wrapper () { - $SH -- "$@" <<-'EOF' -### common.sh - -outfile= -while getopts o: opt; do - case $opt in - o) outfile="$OPTARG" ;; - esac -done - -shift $(($OPTIND - 1)) - -### postopts.sh - -echo "Options passed to wrapper:" -[ -z "$outfile" ] || echo "|$outfile|" - -echo "Arguments passed to wrapper:" -for arg; do - echo "|$arg|" -done - -pandoc () { - echo "Arguments passed to wrappee:" - for arg; do - echo "|$arg|" - done -} -runpandoc -EOF -} - -# Portable which(1). -pathfind () { - oldifs="$IFS"; IFS=':' - for _p in $PATH; do - if [ -x "$_p/$*" ] && [ -f "$_p/$*" ]; then - IFS="$oldifs" - return 0 - fi - done - IFS="$oldifs" - return 1 -} - -check_results () { - if [ "$1" = "$2" ]; then - echo >&2 ok - return 0 - else - echo >&2 failed - sed "s/^/\t/" >&2 <<EOF -Command line: '$3' -===> Expected: -$2 -<=== Got: -$1 -EOF - return 1 - fi -} - -for SH in "$BASH" "$DASH" "$KSH" "$ZSH"; do - CMD=${SH%% *} - echo >&2 " Testing with $CMD..." - if pathfind "$CMD"; then - if [ "$CMD" = "zsh" ]; then - # Zsh needs to be called as 'sh' to enable POSIX mode. - ln -s $(which zsh) ./sh - SH="./sh ${SH#* }" - trap 'err=$?; rm -f ./sh; exit $err' 0 1 2 3 13 15 - fi - - set -e - - # Test 1 - printf >&2 " test case 1... " - actual=$(wrapper -o "output file" "foo bar" -A "quux baz" -B) - expected=$(cat <<'EOF' -Options passed to wrapper: -|output file| -Arguments passed to wrapper: -|foo bar| -Arguments passed to wrappee: -|-A| -|quux baz| -|-B| -EOF -) - check_results "$actual" "$expected" \ - 'wrapper -o "output file" "foo bar" -A "quux baz" -B' - - # Test 2 - printf >&2 " test case 2... " - actual=$(wrapper -- -A "foo bar") - expected=$(cat <<'EOF' -Options passed to wrapper: -Arguments passed to wrapper: -Arguments passed to wrappee: -|-A| -|foo bar| -EOF -) - check_results "$actual" "$expected" 'wrapper -- -A "foo bar"' - - # Test 3 (Test 1 with a redundant '--') - printf >&2 " test case 3... " - actual=$(wrapper -o "output file" "foo bar" -- -A "quux baz" -B) - expected=$(cat <<'EOF' -Options passed to wrapper: -|output file| -Arguments passed to wrapper: -|foo bar| -Arguments passed to wrappee: -|-A| -|quux baz| -|-B| -EOF -) - check_results "$actual" "$expected" \ - 'wrapper -o "output file" "foo bar" -- -A "quux baz" -B' - else - echo >&2 "Warning: cannot verify correctness with $CMD; shell not available" - fi -done - -exit 0 diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in new file mode 100644 index 000000000..64ff3db9b --- /dev/null +++ b/src/wrappers/web2markdown.in @@ -0,0 +1,173 @@ +#!/bin/sh -e +# converts HTML from a URL, file, or stdin to markdown +# uses an available program to fetch URL and tidy to normalize it first + +REQUIRED="tidy html2markdown" + +### common.sh + +grab_url_with () { + url="${1:?internal error: grab_url_with: url required}" + + shift + cmdline="$@" + + prog= + prog_opts= + if [ -n "$cmdline" ]; then + eval "set -- $cmdline" + prog=$1 + shift + prog_opts="$@" + fi + + if [ -z "$prog" ]; then + # Locate a sensible web grabber (note the order). + for p in wget lynx w3m curl links w3c; do + if pathfind $p; then + prog=$p + break + fi + done + + [ -n "$prog" ] || { + errn "$THIS: Couldn't find a program to fetch the file from URL " + err "(e.g. wget, w3m, lynx, w3c, or curl)." + return 1 + } + else + pathfind "$prog" || { + err "$THIS: No such web grabber '$prog' found; aborting." + return 1 + } + fi + + # Setup proper base options for known grabbers. + base_opts= + case "$prog" in + wget) base_opts="-O-" ;; + lynx) base_opts="-source" ;; + w3m) base_opts="-dump_source" ;; + curl) base_opts="" ;; + links) base_opts="-source" ;; + w3c) base_opts="-n -get" ;; + *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." + esac + + err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." + eval "set -- $base_opts $prog_opts" + $prog "$@" "$url" +} + +add_option () { + options="$options$NEWLINE$1" +} + +options= +argument= +encoding= +grabber= + +# Parse command-line arguments +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) + html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2 + err " -e ENCODING, --encoding=ENCODING" + err " Specify character encoding of input" + err " -g COMMAND, --grabber=COMMAND" + err " Specify command to be used to grab contents of URL" + exit 0 ;; + -v|--version) + html2markdown -v + exit 0 ;; + -e) + shift + encoding=$1 ;; + --encoding=*) + wholeopt=$1 + # extract encoding from after = + encoding=${wholeopt#*=} ;; + -g) + shift + grabber=$1 ;; + --grabber=*) + wholeopt=$1 + # extract encoding from after = + grabber=${wholeopt#*=} ;; + -o|--output|-b|--tab-stop|-H|--include-in-header| \ + -A|--include-after-body|-C|-B|--include-before-body| \ + -C|--custom-header|-T|--title-prefix) + add_option $1 + shift + add_option $1 ;; + -*) add_option $1 ;; + *) + if [ -z "$argument" ]; then + argument=$1 + else + err "Warning: extra argument '$1' will be ignored." + fi ;; + esac + shift +done + +# Unpack options. Now "$@" will hold the html2markdown options. +oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs" + +inurl= +if [ -n "$argument" ] && ! [ -f "$argument" ]; then + # Treat given argument as an URL. + inurl="$argument" +fi + +if [ -n "$inurl" ]; then + err "Attempting to fetch file from '$inurl'..." + + ### tempdir.sh + + grabber_out=$THIS_TEMPDIR/grabber.out + grabber_log=$THIS_TEMPDIR/grabber.log + if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then + errn "grab_url_with failed" + if [ -f $grabber_log ]; then + err " with the following error log." + err + cat >&2 $grabber_log + else + err . + fi + exit 1 + fi + + argument="$grabber_out" +fi + +if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then + # Try to determine character encoding if not specified + # and input is not STDIN. + encoding=$( + head "$argument" | + LC_ALL=C tr 'A-Z' 'a-z' | + sed -ne '/<meta .*content-type.*charset=/ { + s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p + }' + ) +fi + +if [ -n "$encoding" ] && pathfind iconv; then + alias to_utf8='iconv -f "$encoding" -t utf-8' +else # assume UTF-8 + alias to_utf8='cat' +fi + +if [ -z "$argument" ]; then + tidy -utf8 2>/dev/null | html2markdown "$@" +else + if [ -f "$argument" ]; then + to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@" + else + err "File '$argument' not found." + exit 1 + fi +fi |