aboutsummaryrefslogtreecommitdiff
path: root/web/html2x.pl
blob: a034f0e58fd490f203e89a25b26ceff84a6986da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env perl
use strict;
use CGI qw/:standard/;
use CGI::Carp 'fatalsToBrowser';

# html2x: CGI front end that downloads a web page with wget, cleans it up
# with tidy and converts it with pandoc to the requested output format.
#
# CGI parameters (both required):
#   url    - address of the page to convert; the "http://" or "https://"
#            prefix is optional
#   format - one of: markdown, markdown+, rst, latex, context, rtf, man,
#            docbook.  "markdown" adds pandoc's --strict option;
#            "markdown+" selects the markdown writer without --strict.
#
# On success the converted document is returned as text/plain.  If the
# pipeline yields nothing but whitespace, a short HTML notice is returned.
# Invalid or missing parameters abort the script with die().

$CGI::POST_MAX        = 1024 * 100;    # max 100K posts
$CGI::DISABLE_UPLOADS = 1;             # no uploads

param('url') && param('format') or die "Missing url and/or format parameters.\n";

my $options = '-r html --standalone --reference-links';
my $url     = param('url');
my $format  = param('format');

# Validate URL and format.  Both values end up on a shell command line, so
# they must match in full: \A and \z are used instead of ^ and $ because $
# would also accept a trailing newline.
unless ($url =~ m{\A(?:https?://)?[\w#?_-]+(?:\.[\w#?_-]+)+[\w/#?_.-]*\z}) {
  die "Illegal URL: $url\n";
}

# Exact-match whitelist.  A single alternation such as /^a|b|c$/ anchors only
# its first and last branches, so it cannot be used to vet shell input.
my @known_formats = qw(markdown markdown+ rst latex context rtf man docbook);
unless (grep { $format eq $_ } @known_formats) {
  die "Illegal format: $format\n";
}

if ($format eq 'markdown') {
  $options .= ' --strict';
}
elsif ($format eq 'markdown+') {
  $format = 'markdown';
}

# The URL pattern above admits no quote characters, so wrapping the URL in
# single quotes keeps the shell from glob-expanding '?'.  The '--' stops wget
# from treating a URL that begins with '-' as a command-line option.
my $output = `wget -O- -- '$url' | tidy -asxhtml -utf8 | pandoc -w $format $options`;

if (!defined $output || $output !~ /\S/) {
  # A CGI response must start with an HTTP header, HTML pages included.
  print header(-charset => 'utf-8'),
        start_html(),
        h1("No output"),
        p("Either $url could not be retrieved, or its HTML was too malformed to parse."),
        end_html();
  exit 0;
}

print header(-charset => 'utf-8', -type => 'text/plain'),
      $output;