diff options
| author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-11-03 23:27:58 +0000 | 
|---|---|---|
| committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-11-03 23:27:58 +0000 | 
| commit | fe684764e68e7eda281192f1fdd637a5bdb50e43 (patch) | |
| tree | acd3377ff911700adad9609d475e115c89eddeb8 /src/Text | |
| parent | 4a841bfc5464907adea4cdd655485565565b40ae (diff) | |
| download | pandoc-fe684764e68e7eda281192f1fdd637a5bdb50e43.tar.gz | |
Reverted back to state as of r1062.  The template haskell changes
are more trouble than they're worth.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1064 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc.hs | 110 | ||||
| -rw-r--r-- | src/Text/Pandoc/Blocks.hs | 145 | ||||
| -rw-r--r-- | src/Text/Pandoc/CharacterReferences.hs | 327 | ||||
| -rw-r--r-- | src/Text/Pandoc/Definition.hs | 116 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 496 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 651 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 909 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 640 | ||||
| -rw-r--r-- | src/Text/Pandoc/Shared.hs | 792 | ||||
| -rw-r--r-- | src/Text/Pandoc/UTF8.hs | 45 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/ConTeXt.hs | 248 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/Docbook.hs | 299 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 458 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/LaTeX.hs | 310 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/Man.hs | 293 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/Markdown.hs | 373 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/RST.hs | 325 | ||||
| -rw-r--r-- | src/Text/Pandoc/Writers/RTF.hs | 286 | 
18 files changed, 6823 insertions, 0 deletions
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs new file mode 100644 index 000000000..7633bf7ef --- /dev/null +++ b/src/Text/Pandoc.hs @@ -0,0 +1,110 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha  +   Portability : portable + +This helper module exports the main writers, readers, and data +structure definitions from the Pandoc libraries. + +A typical application will chain together a reader and a writer +to convert strings from one format to another.  For example, the +following simple program will act as a filter converting markdown +fragments to reStructuredText, using reference-style links instead of +inline links: + +> module Main where +> import Text.Pandoc +>  +> markdownToRST :: String -> String +> markdownToRST = toUTF8 . +>   (writeRST defaultWriterOptions {writerReferenceLinks = True}) . +>   (readMarkdown defaultParserState) .  fromUTF8 +>  +> main = interact markdownToRST + +-} + +module Text.Pandoc +               (  +               -- * Definitions +               module Text.Pandoc.Definition +               -- * Readers: converting /to/ Pandoc format +               , readMarkdown +               , readRST +               , readLaTeX +               , readHtml +               -- * Parser state used in readers +               , ParserState (..) +               , defaultParserState +               , ParserContext (..) +               , QuoteContext (..) +               , KeyTable +               , NoteTable +               , HeaderType (..) +               -- * Writers: converting /from/ Pandoc format +               , writeMarkdown +               , writeRST +               , writeLaTeX +               , writeConTeXt +               , writeHtml +               , writeHtmlString +               , writeS5 +               , writeS5String +               , writeDocbook +               , writeMan +               , writeRTF +               , prettyPandoc +               -- * Writer options used in writers  +               , WriterOptions (..) +               , defaultWriterOptions +               -- * Default headers for various output formats +               , module Text.Pandoc.Writers.DefaultHeaders +               -- * Functions for converting to and from UTF-8 +               , module Text.Pandoc.UTF8 +               -- * Version +               , pandocVersion +             ) where + +import Text.Pandoc.Definition +import Text.Pandoc.Readers.Markdown +import Text.Pandoc.Readers.RST +import Text.Pandoc.Readers.LaTeX +import Text.Pandoc.Readers.HTML +import Text.Pandoc.Writers.Markdown +import Text.Pandoc.Writers.RST  +import Text.Pandoc.Writers.LaTeX +import Text.Pandoc.Writers.ConTeXt +import Text.Pandoc.Writers.HTML +import Text.Pandoc.Writers.S5 +import Text.Pandoc.Writers.Docbook +import Text.Pandoc.Writers.Man +import Text.Pandoc.Writers.RTF  +import Text.Pandoc.Writers.DefaultHeaders +import Text.Pandoc.UTF8 +import Text.Pandoc.Shared + +-- | Version number of pandoc library. +pandocVersion :: String +pandocVersion = "0.45" diff --git a/src/Text/Pandoc/Blocks.hs b/src/Text/Pandoc/Blocks.hs new file mode 100644 index 000000000..cfc22cb3e --- /dev/null +++ b/src/Text/Pandoc/Blocks.hs @@ -0,0 +1,145 @@ +{- +Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Blocks +   Copyright   : Copyright (C) 2007 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Functions for the manipulation of fixed-width blocks of text. +These are used in the construction of plain-text tables. +-} + +module Text.Pandoc.Blocks +               (  +                TextBlock (..), +                docToBlock, +                blockToDoc, +                widthOfBlock, +                heightOfBlock, +                hcatBlocks, +                hsepBlocks, +                centerAlignBlock, +                leftAlignBlock, +                rightAlignBlock +               ) +where +import Text.PrettyPrint +import Data.List ( intersperse ) + +-- | A fixed-width block of text.  Parameters are width of block, +-- height of block, and list of lines. +data TextBlock = TextBlock Int Int [String] +instance Show TextBlock where +  show x = show $ blockToDoc x + +-- | Break lines in a list of lines so that none are greater than +-- a given width. +breakLines :: Int      -- ^ Maximum length of lines. +           -> [String] -- ^ List of lines. +           -> [String] +breakLines width [] = [] +breakLines width (l:ls) = +  if length l > width +     then (take width l):(breakLines width ((drop width l):ls)) +     else l:(breakLines width ls) + +-- | Convert a @Doc@ element into a @TextBlock@ with a specified width.  +docToBlock :: Int  -- ^ Width of text block. +           -> Doc  -- ^ @Doc@ to convert. +           -> TextBlock +docToBlock width doc = +  let rendered    = renderStyle (style {lineLength = width,  +                                        ribbonsPerLine = 1}) doc +      lns         = breakLines width $ lines rendered +  in  TextBlock width (length lns) lns + +-- | Convert a @TextBlock@ to a @Doc@ element. +blockToDoc :: TextBlock -> Doc +blockToDoc (TextBlock _ _ lns) =  +  if null lns +     then empty +     else vcat $ map text lns + +-- | Returns width of a @TextBlock@ (number of columns). +widthOfBlock :: TextBlock -> Int +widthOfBlock (TextBlock width _ _) = width + +-- | Returns height of a @TextBlock@ (number of rows). +heightOfBlock :: TextBlock -> Int +heightOfBlock (TextBlock _ height _) = height + +-- | Pads a string out to a given width using spaces. +hPad :: Int     -- ^ Desired width. +     -> String  -- ^ String to pad. +     -> String +hPad width line =  +  let lineLength = length line +  in  if lineLength <= width  +         then line ++ replicate (width - lineLength) ' ' +         else take width line + +-- | Concatenates a list of @TextBlock@s into a new @TextBlock@ in +-- which they appear side by side. +hcatBlocks :: [TextBlock] -> TextBlock +hcatBlocks [] = TextBlock 0 0 [] +hcatBlocks [x] = x -- This is not redundant!  We don't want last item hPad'd. +hcatBlocks ((TextBlock width1 height1 lns1):xs) =  +  let (TextBlock width2 height2 lns2) = hcatBlocks xs +      height = max height1 height2 +      width  = width1 + width2 +      lns1'  = map (hPad width1) $ lns1 ++ replicate (height - height1) "" +      lns2'  = lns2 ++ replicate (height - height2) "" +      lns    = zipWith (++) lns1' lns2' +   in TextBlock width height lns  + +-- | Like @hcatBlocks@, but inserts space between the @TextBlock@s. +hsepBlocks :: [TextBlock] -> TextBlock +hsepBlocks = hcatBlocks . (intersperse (TextBlock 1 1 [" "])) + +isWhitespace x = x `elem` " \t" + +-- | Left-aligns the contents of a @TextBlock@ within the block. +leftAlignBlock :: TextBlock -> TextBlock +leftAlignBlock (TextBlock width height lns) = +  TextBlock width height $ map (dropWhile isWhitespace) lns + +-- | Right-aligns the contents of a @TextBlock@ within the block. +rightAlignBlock :: TextBlock -> TextBlock +rightAlignBlock (TextBlock width height lns) = +  let rightAlignLine ln =  +        let (spaces, rest) = span isWhitespace $ reverse $ hPad width ln +        in  reverse (rest ++ spaces) +  in  TextBlock width height $ map rightAlignLine lns + +-- | Centers the contents of a @TextBlock@ within the block. +centerAlignBlock :: TextBlock -> TextBlock +centerAlignBlock (TextBlock width height lns) =  +  let centerAlignLine ln = +        let ln' = hPad width ln +            (startSpaces, rest) = span isWhitespace ln' +            endSpaces = takeWhile isWhitespace (reverse ln') +            numSpaces = length (startSpaces ++ endSpaces) +            startSpaces' = replicate (quot numSpaces 2) ' ' +        in  startSpaces' ++ rest  +  in  TextBlock width height $ map centerAlignLine lns + diff --git a/src/Text/Pandoc/CharacterReferences.hs b/src/Text/Pandoc/CharacterReferences.hs new file mode 100644 index 000000000..466f5d8f4 --- /dev/null +++ b/src/Text/Pandoc/CharacterReferences.hs @@ -0,0 +1,327 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.CharacterReferences +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Functions for parsing character references. +-} +module Text.Pandoc.CharacterReferences ( +                     characterReference, +                     decodeCharacterReferences, +                    ) where +import Data.Char ( chr ) +import Text.ParserCombinators.Parsec +import qualified Data.Map as Map + +-- | Parse character entity. +characterReference :: GenParser Char st Char +characterReference = try $ do +  st <- char '&' +  character <- numRef <|> entity +  end <- char ';' +  return character   + +numRef :: GenParser Char st Char +numRef = do +  char '#' +  num <- hexNum <|> decNum +  return $ chr $ num  + +hexNum :: GenParser Char st Int  +hexNum = oneOf "Xx" >> many1 hexDigit >>= return . read . ("0x" ++) + +decNum :: GenParser Char st Int  +decNum = many1 digit >>= return . read + +entity :: GenParser Char st Char +entity = do +  body <- many1 alphaNum +  return $ Map.findWithDefault '?' body entityTable + +-- | Convert entities in a string to characters. +decodeCharacterReferences :: String -> String +decodeCharacterReferences str =  +  case parse (many (characterReference <|> anyChar)) str str of +	Left err        -> error $ "\nError: " ++ show err +	Right result    -> result + +entityTable :: Map.Map String Char +entityTable = Map.fromList entityTableList + +entityTableList :: [(String, Char)] +entityTableList =  [ +	("quot", chr 34), +	("amp", chr 38), +	("lt", chr 60), +	("gt", chr 62), +	("nbsp", chr 160), +	("iexcl", chr 161), +	("cent", chr 162), +	("pound", chr 163), +	("curren", chr 164), +	("yen", chr 165), +	("brvbar", chr 166), +	("sect", chr 167), +	("uml", chr 168), +	("copy", chr 169), +	("ordf", chr 170), +	("laquo", chr 171), +	("not", chr 172), +	("shy", chr 173), +	("reg", chr 174), +	("macr", chr 175), +	("deg", chr 176), +	("plusmn", chr 177), +	("sup2", chr 178), +	("sup3", chr 179), +	("acute", chr 180), +	("micro", chr 181), +	("para", chr 182), +	("middot", chr 183), +	("cedil", chr 184), +	("sup1", chr 185), +	("ordm", chr 186), +	("raquo", chr 187), +	("frac14", chr 188), +	("frac12", chr 189), +	("frac34", chr 190), +	("iquest", chr 191), +	("Agrave", chr 192), +	("Aacute", chr 193), +	("Acirc", chr 194), +	("Atilde", chr 195), +	("Auml", chr 196), +	("Aring", chr 197), +	("AElig", chr 198), +	("Ccedil", chr 199), +	("Egrave", chr 200), +	("Eacute", chr 201), +	("Ecirc", chr 202), +	("Euml", chr 203), +	("Igrave", chr 204), +	("Iacute", chr 205), +	("Icirc", chr 206), +	("Iuml", chr 207), +	("ETH", chr 208), +	("Ntilde", chr 209), +	("Ograve", chr 210), +	("Oacute", chr 211), +	("Ocirc", chr 212), +	("Otilde", chr 213), +	("Ouml", chr 214), +	("times", chr 215), +	("Oslash", chr 216), +	("Ugrave", chr 217), +	("Uacute", chr 218), +	("Ucirc", chr 219), +	("Uuml", chr 220), +	("Yacute", chr 221), +	("THORN", chr 222), +	("szlig", chr 223), +	("agrave", chr 224), +	("aacute", chr 225), +	("acirc", chr 226), +	("atilde", chr 227), +	("auml", chr 228), +	("aring", chr 229), +	("aelig", chr 230), +	("ccedil", chr 231), +	("egrave", chr 232), +	("eacute", chr 233), +	("ecirc", chr 234), +	("euml", chr 235), +	("igrave", chr 236), +	("iacute", chr 237), +	("icirc", chr 238), +	("iuml", chr 239), +	("eth", chr 240), +	("ntilde", chr 241), +	("ograve", chr 242), +	("oacute", chr 243), +	("ocirc", chr 244), +	("otilde", chr 245), +	("ouml", chr 246), +	("divide", chr 247), +	("oslash", chr 248), +	("ugrave", chr 249), +	("uacute", chr 250), +	("ucirc", chr 251), +	("uuml", chr 252), +	("yacute", chr 253), +	("thorn", chr 254), +	("yuml", chr 255), +	("OElig", chr 338), +	("oelig", chr 339), +	("Scaron", chr 352), +	("scaron", chr 353), +	("Yuml", chr 376), +	("fnof", chr 402), +	("circ", chr 710), +	("tilde", chr 732), +	("Alpha", chr 913), +	("Beta", chr 914), +	("Gamma", chr 915), +	("Delta", chr 916), +	("Epsilon", chr 917), +	("Zeta", chr 918), +	("Eta", chr 919), +	("Theta", chr 920), +	("Iota", chr 921), +	("Kappa", chr 922), +	("Lambda", chr 923), +	("Mu", chr 924), +	("Nu", chr 925), +	("Xi", chr 926), +	("Omicron", chr 927), +	("Pi", chr 928), +	("Rho", chr 929), +	("Sigma", chr 931), +	("Tau", chr 932), +	("Upsilon", chr 933), +	("Phi", chr 934), +	("Chi", chr 935), +	("Psi", chr 936), +	("Omega", chr 937), +	("alpha", chr 945), +	("beta", chr 946), +	("gamma", chr 947), +	("delta", chr 948), +	("epsilon", chr 949), +	("zeta", chr 950), +	("eta", chr 951), +	("theta", chr 952), +	("iota", chr 953), +	("kappa", chr 954), +	("lambda", chr 955), +	("mu", chr 956), +	("nu", chr 957), +	("xi", chr 958), +	("omicron", chr 959), +	("pi", chr 960), +	("rho", chr 961), +	("sigmaf", chr 962), +	("sigma", chr 963), +	("tau", chr 964), +	("upsilon", chr 965), +	("phi", chr 966), +	("chi", chr 967), +	("psi", chr 968), +	("omega", chr 969), +	("thetasym", chr 977), +	("upsih", chr 978), +	("piv", chr 982), +	("ensp", chr 8194), +	("emsp", chr 8195), +	("thinsp", chr 8201), +	("zwnj", chr 8204), +	("zwj", chr 8205), +	("lrm", chr 8206), +	("rlm", chr 8207), +	("ndash", chr 8211), +	("mdash", chr 8212), +	("lsquo", chr 8216), +	("rsquo", chr 8217), +	("sbquo", chr 8218), +	("ldquo", chr 8220), +	("rdquo", chr 8221), +	("bdquo", chr 8222), +	("dagger", chr 8224), +	("Dagger", chr 8225), +	("bull", chr 8226), +	("hellip", chr 8230), +	("permil", chr 8240), +	("prime", chr 8242), +	("Prime", chr 8243), +	("lsaquo", chr 8249), +	("rsaquo", chr 8250), +	("oline", chr 8254), +	("frasl", chr 8260), +	("euro", chr 8364), +	("image", chr 8465), +	("weierp", chr 8472), +	("real", chr 8476), +	("trade", chr 8482), +	("alefsym", chr 8501), +	("larr", chr 8592), +	("uarr", chr 8593), +	("rarr", chr 8594), +	("darr", chr 8595), +	("harr", chr 8596), +	("crarr", chr 8629), +	("lArr", chr 8656), +	("uArr", chr 8657), +	("rArr", chr 8658), +	("dArr", chr 8659), +	("hArr", chr 8660), +	("forall", chr 8704), +	("part", chr 8706), +	("exist", chr 8707), +	("empty", chr 8709), +	("nabla", chr 8711), +	("isin", chr 8712), +	("notin", chr 8713), +	("ni", chr 8715), +	("prod", chr 8719), +	("sum", chr 8721), +	("minus", chr 8722), +	("lowast", chr 8727), +	("radic", chr 8730), +	("prop", chr 8733), +	("infin", chr 8734), +	("ang", chr 8736), +	("and", chr 8743), +	("or", chr 8744), +	("cap", chr 8745), +	("cup", chr 8746), +	("int", chr 8747), +	("there4", chr 8756), +	("sim", chr 8764), +	("cong", chr 8773), +	("asymp", chr 8776), +	("ne", chr 8800), +	("equiv", chr 8801), +	("le", chr 8804), +	("ge", chr 8805), +	("sub", chr 8834), +	("sup", chr 8835), +	("nsub", chr 8836), +	("sube", chr 8838), +	("supe", chr 8839), +	("oplus", chr 8853), +	("otimes", chr 8855), +	("perp", chr 8869), +	("sdot", chr 8901), +	("lceil", chr 8968), +	("rceil", chr 8969), +	("lfloor", chr 8970), +	("rfloor", chr 8971), +	("lang", chr 9001), +	("rang", chr 9002), +	("loz", chr 9674), +	("spades", chr 9824), +	("clubs", chr 9827), +	("hearts", chr 9829), +	("diams", chr 9830) +	] diff --git a/src/Text/Pandoc/Definition.hs b/src/Text/Pandoc/Definition.hs new file mode 100644 index 000000000..7d1125c5a --- /dev/null +++ b/src/Text/Pandoc/Definition.hs @@ -0,0 +1,116 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Definition +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Definition of 'Pandoc' data structure for format-neutral representation +of documents. +-} +module Text.Pandoc.Definition where + +data Pandoc = Pandoc Meta [Block] deriving (Eq, Read, Show) + +-- | Bibliographic information for the document:  title (list of 'Inline'), +-- authors (list of strings), date (string). +data Meta = Meta [Inline] -- title +                 [String] -- authors +                 String   -- date +            deriving (Eq, Show, Read) + +-- | Alignment of a table column. +data Alignment = AlignLeft  +               | AlignRight  +               | AlignCenter  +               | AlignDefault deriving (Eq, Show, Read) + +-- | List attributes. +type ListAttributes = (Int, ListNumberStyle, ListNumberDelim) + +-- | Style of list numbers. +data ListNumberStyle = DefaultStyle +                     | Decimal  +                     | LowerRoman  +                     | UpperRoman +                     | LowerAlpha  +                     | UpperAlpha deriving (Eq, Show, Read) + +-- | Delimiter of list numbers. +data ListNumberDelim = DefaultDelim +                     | Period +                     | OneParen  +                     | TwoParens deriving (Eq, Show, Read) +                    +-- | Block element. +data Block   +    = Plain [Inline]        -- ^ Plain text, not a paragraph +    | Para [Inline]         -- ^ Paragraph +    | CodeBlock String      -- ^ Code block (literal) +    | RawHtml String        -- ^ Raw HTML block (literal) +    | BlockQuote [Block]    -- ^ Block quote (list of blocks) +    | OrderedList ListAttributes [[Block]] -- ^ Ordered list (attributes +                            -- and a list of items, each a list of blocks) +    | BulletList [[Block]]  -- ^ Bullet list (list of items, each +                            -- a list of blocks) +    | DefinitionList [([Inline],[Block])]  -- ^ Definition list  +                            -- (list of items, each a pair of an inline list, +                            -- the term, and a block list) +    | Header Int [Inline]   -- ^ Header - level (integer) and text (inlines)  +    | HorizontalRule        -- ^ Horizontal rule +    | Table [Inline] [Alignment] [Float] [[Block]] [[[Block]]]  -- ^ Table, +                            -- with caption, column alignments, +                            -- relative column widths, column headers +                            -- (each a list of blocks), and rows +                            -- (each a list of lists of blocks) +    | Null                  -- ^ Nothing +    deriving (Eq, Read, Show) + +-- | Type of quotation marks to use in Quoted inline. +data QuoteType = SingleQuote | DoubleQuote deriving (Show, Eq, Read) + +type Target = (String, String)  -- ^ Link target (URL, title) + +-- | Inline elements. +data Inline  +    = Str String            -- ^ Text (string) +    | Emph [Inline]         -- ^ Emphasized text (list of inlines) +    | Strong [Inline]       -- ^ Strongly emphasized text (list of inlines) +    | Strikeout [Inline]    -- ^ Strikeout text (list of inlines) +    | Superscript [Inline]  -- ^ Superscripted text (list of inlines) +    | Subscript [Inline]    -- ^ Subscripted text (list of inlines) +    | Quoted QuoteType [Inline] -- ^ Quoted text (list of inlines) +    | Code String           -- ^ Inline code (literal) +    | Space                 -- ^ Inter-word space +    | EmDash                -- ^ Em dash +    | EnDash                -- ^ En dash +    | Apostrophe            -- ^ Apostrophe +    | Ellipses              -- ^ Ellipses +    | LineBreak             -- ^ Hard line break +    | TeX String            -- ^ LaTeX code (literal) +    | HtmlInline String     -- ^ HTML code (literal) +    | Link [Inline] Target  -- ^ Hyperlink: text (list of inlines), target +    | Image [Inline] Target -- ^ Image:  alt text (list of inlines), target +                            -- and target +    | Note [Block]          -- ^ Footnote or endnote  +    deriving (Show, Eq, Read) diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..70a071152 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,496 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.HTML +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha  +   Portability : portable + +Conversion of HTML to 'Pandoc' document. +-} +module Text.Pandoc.Readers.HTML (  +                                 readHtml,  +                                 rawHtmlInline,  +                                 rawHtmlBlock,  +                                 anyHtmlBlockTag,  +                                 anyHtmlInlineTag,   +                                 anyHtmlTag, +                                 anyHtmlEndTag, +                                 htmlEndTag, +                                 extractTagType, +                                 htmlBlockElement  +                                ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.Pandoc.CharacterReferences ( characterReference,  +                                         decodeCharacterReferences ) +import Data.Maybe ( fromMaybe ) +import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf ) +import Data.Char ( toUpper, toLower, isAlphaNum ) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: ParserState   -- ^ Parser state +         -> String        -- ^ String to parse +         -> Pandoc +readHtml = readWith parseHtml + +-- +-- Constants +-- + +eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", +                  "map", "area", "object", "script"] + +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", +                  "br", "cite", "code", "dfn", "em", "font", "i", "img", +                  "input", "kbd", "label", "q", "s", "samp", "select", +                  "small", "span", "strike", "strong", "sub", "sup", +                  "textarea", "tt", "u", "var"] ++ eitherBlockOrInline + +blockHtmlTags = ["address", "blockquote", "center", "dir", "div", +                 "dl", "fieldset", "form", "h1", "h2", "h3", "h4", +                 "h5", "h6", "hr", "isindex", "menu", "noframes", +                 "noscript", "ol", "p", "pre", "table", "ul", "dd", +                 "dt", "frameset", "li", "tbody", "td", "tfoot", +                 "th", "thead", "tr"] ++ eitherBlockOrInline + +-- +-- HTML utility functions +-- + +-- | Read blocks until end tag. +blocksTilEnd tag = do +  blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) +  return $ filter (/= Null) blocks + +-- | Read inlines until end tag. +inlinesTilEnd tag = manyTill inline (htmlEndTag tag) + +-- | Parse blocks between open and close tag. +blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag + +-- | Parse inlines between open and close tag. +inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag + +-- | Extract type from a tag:  e.g. @br@ from @\<br\>@ +extractTagType :: String -> String +extractTagType ('<':rest) =  +  let isSpaceOrSlash c = c `elem` "/ \n\t" in +  map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest +extractTagType _ = "" + +-- | Parse any HTML tag (opening or self-closing) and return text of tag +anyHtmlTag = try $ do +  char '<' +  spaces +  tag <- many1 alphaNum +  attribs <- many htmlAttribute +  spaces +  ender <- option "" (string "/") +  let ender' = if null ender then "" else " /" +  spaces +  char '>' +  return $ "<" ++ tag ++  +           concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + +anyHtmlEndTag = try $ do +  char '<'    +  spaces +  char '/' +  spaces +  tagType <- many1 alphaNum +  spaces +  char '>' +  return $ "</" ++ tagType ++ ">" + +htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag tag = try $ do +  char '<' +  spaces +  stringAnyCase tag +  attribs <- many htmlAttribute +  spaces +  optional (string "/") +  spaces +  char '>' +  return (tag, (map (\(name, content, raw) -> (name, content)) attribs)) + +-- parses a quoted html attribute value +quoted quoteChar = do +  result <- between (char quoteChar) (char quoteChar)  +                    (many (noneOf [quoteChar])) +  return (result, [quoteChar]) + +htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute + +-- minimized boolean attribute +htmlMinimizedAttribute = try $ do +  many1 space +  name <- many1 (choice [letter, oneOf ".-_:"]) +  return (name, name, name) + +htmlRegularAttribute = try $ do +  many1 space +  name <- many1 (choice [letter, oneOf ".-_:"]) +  spaces +  char '=' +  spaces +  (content, quoteStr) <- choice [ (quoted '\''),  +                                  (quoted '"'),  +                                  (do +                                     a <- many (alphaNum <|> (oneOf "-._:")) +                                     return (a,"")) ] +  return (name, content, +          (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) + +-- | Parse an end tag of type 'tag' +htmlEndTag tag = try $ do +  char '<'    +  spaces +  char '/' +  spaces +  stringAnyCase tag +  spaces +  char '>' +  return $ "</" ++ tag ++ ">" + +-- | Returns @True@ if the tag is (or can be) an inline tag. +isInline tag = (extractTagType tag) `elem` inlineHtmlTags + +-- | Returns @True@ if the tag is (or can be) a block tag. +isBlock tag = (extractTagType tag) `elem` blockHtmlTags  + +anyHtmlBlockTag = try $ do +  tag <- anyHtmlTag <|> anyHtmlEndTag +  if isBlock tag then return tag else fail "inline tag" + +anyHtmlInlineTag = try $ do +  tag <- anyHtmlTag <|> anyHtmlEndTag +  if isInline tag then return tag else fail "not an inline tag" + +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. +htmlScript = try $ do +  open <- string "<script" +  rest <- manyTill anyChar (htmlEndTag "script") +  return $ open ++ rest ++ "</script>" + +htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] + +rawHtmlBlock = try $ do +  notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") +  body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag +  sp <- many space +  state <- getState +  if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null + +-- | Parses an HTML comment. +htmlComment = try $ do +  string "<!--" +  comment <- manyTill anyChar (try (string "-->")) +  return $ "<!--" ++ comment ++ "-->" + +-- +-- parsing documents +-- + +xmlDec = try $ do +  string "<?" +  rest <- manyTill anyChar (char '>') +  return $ "<?" ++ rest ++ ">" + +definition = try $ do +  string "<!" +  rest <- manyTill anyChar (char '>') +  return $ "<!" ++ rest ++ ">" + +nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> +  ((rawHtmlBlock >> return ' ') <|> anyChar) + +parseTitle = try $ do +  (tag, _) <- htmlTag "title" +  contents <- inlinesTilEnd tag +  spaces +  return contents + +-- parse header and return meta-information (for now, just title) +parseHead = try $ do +  htmlTag "head" +  spaces +  skipMany nonTitleNonHead +  contents <- option [] parseTitle +  skipMany nonTitleNonHead +  htmlTag "/head" +  return (contents, [], "") + +skipHtmlTag tag = optional (htmlTag tag) + +-- h1 class="title" representation of title in body +bodyTitle = try $ do +  (tag, attribs) <- htmlTag "h1"   +  cl <- case (extractAttribute "class" attribs) of +          Just "title" -> return "" +          otherwise    -> fail "not title" +  inlinesTilEnd "h1" + +parseHtml = do +  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces +  skipHtmlTag "html" +  spaces +  (title, authors, date) <- option ([], [], "") parseHead  +  spaces +  skipHtmlTag "body" +  spaces +  optional bodyTitle  -- skip title in body, because it's represented in meta +  blocks <- parseBlocks +  spaces +  optional (htmlEndTag "body") +  spaces +  optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> +  eof +  return $ Pandoc (Meta title authors date) blocks + +-- +-- parsing blocks +-- + +parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) + +block = choice [ codeBlock +               , header +               , hrule +               , list +               , blockQuote +               , para +               , plain +               , rawHtmlBlock ] <?> "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" + +headerLevel n = try $ do +    let level = "h" ++ show n +    (tag, attribs) <- htmlTag level +    contents <- inlinesTilEnd level +    return $ Header n (normalizeSpaces contents) + +-- +-- hrule block +-- + +hrule = try  $ do +  (tag, attribs) <- htmlTag "hr" +  state <- getState +  if not (null attribs) && stateParseRaw state +     then unexpected "attributes in hr" -- parse as raw in this case +     else return HorizontalRule + +-- +-- code blocks +-- + +-- Note:  HTML tags in code blocks (e.g. for syntax highlighting) are  +-- skipped, because they are not portable to output formats other than HTML. +codeBlock = try $ do +    htmlTag "pre"  +    result <- manyTill  +              (many1 (satisfy (/= '<')) <|>  +               ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) +              (htmlEndTag "pre") +    let result' = concat result +    -- drop leading newline if any +    let result'' = if "\n" `isPrefixOf` result' +                      then drop 1 result' +                      else result' +    -- drop trailing newline if any +    let result''' = if "\n" `isSuffixOf` result'' +                       then init result'' +                       else result'' +    return $ CodeBlock $ decodeCharacterReferences result''' + +-- +-- block quotes +-- + +blockQuote = try $ htmlTag "blockquote" >> spaces >>  +                   blocksTilEnd "blockquote" >>= (return . BlockQuote) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +orderedList = try $ do +  (_, attribs) <- htmlTag "ol" +  (start, style) <- option (1, DefaultStyle) $ +                           do failIfStrict +                              let sta = fromMaybe "1" $  +                                        lookup "start" attribs +                              let sty = fromMaybe (fromMaybe "" $ +                                        lookup "style" attribs) $ +                                        lookup "class" attribs +                              let sty' = case sty of +                                          "lower-roman"  -> LowerRoman +                                          "upper-roman"  -> UpperRoman +                                          "lower-alpha"  -> LowerAlpha +                                          "upper-alpha"  -> UpperAlpha +                                          "decimal"      -> Decimal +                                          _              -> DefaultStyle +                              return (read sta, sty') +  spaces +  items <- sepEndBy1 (blocksIn "li") spaces +  htmlEndTag "ol" +  return $ OrderedList (start, style, DefaultDelim) items + +bulletList = try $ do +  htmlTag "ul" +  spaces +  items <- sepEndBy1 (blocksIn "li") spaces +  htmlEndTag "ul" +  return $ BulletList items + +definitionList = try $ do +  failIfStrict  -- def lists not part of standard markdown +  tag <- htmlTag "dl" +  spaces +  items <- sepEndBy1 definitionListItem spaces +  htmlEndTag "dl" +  return $ DefinitionList items + +definitionListItem = try $ do +  terms <- sepEndBy1 (inlinesIn "dt") spaces +  defs <- sepEndBy1 (blocksIn "dd") spaces +  let term = joinWithSep [LineBreak] terms +  return (term, concat defs) + +-- +-- paragraph block +-- + +para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=  +             return . Para . normalizeSpaces + +--  +-- plain block +-- + +plain = many1 inline >>= return . Plain . normalizeSpaces + +--  +-- inline +-- + +inline = choice [ charRef +                , strong +                , emph +                , superscript +                , subscript +                , strikeout +                , spanStrikeout +                , code +                , str +                , linebreak +                , whitespace +                , link +                , image +                , rawHtmlInline +                ] <?> "inline" + +code = try $ do  +  htmlTag "code" +  result <- manyTill anyChar (htmlEndTag "code") +  -- remove internal line breaks, leading and trailing space, +  -- and decode character references +  return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $  +                  joinWithSep " " $ lines result  + +rawHtmlInline = do +  result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag +  state <- getState +  if stateParseRaw state then return (HtmlInline result) else return (Str "") + +betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=  +                        return . normalizeSpaces + +emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph + +strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong + +superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript + +subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript + +strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= +            return . Strikeout + +spanStrikeout = try $ do +  failIfStrict -- strict markdown has no strikeout, so treat as raw HTML +  (tag, attributes) <- htmlTag "span"  +  result <- case (extractAttribute "class" attributes) of +              Just "strikeout" -> inlinesTilEnd "span" +              _                -> fail "not a strikeout" +  return $ Strikeout result + +whitespace = many1 space >> return Space + +-- hard line break +linebreak = htmlTag "br" >> optional newline >> return LineBreak + +str = many1 (noneOf "<& \t\n") >>= return . Str + +-- +-- links and images +-- + +-- extract contents of attribute (attribute names are case-insensitive) +extractAttribute name [] = Nothing +extractAttribute name ((attrName, contents):rest) =  +  let name'     = map toLower name  +      attrName' = map toLower attrName +  in  if attrName' == name' +         then Just (decodeCharacterReferences contents) +         else extractAttribute name rest + +link = try $ do +  (tag, attributes) <- htmlTag "a"   +  url <- case (extractAttribute "href" attributes) of +           Just url -> return url +           Nothing  -> fail "no href" +  let title = fromMaybe "" $ extractAttribute "title" attributes +  label <- inlinesTilEnd "a" +  return $ Link (normalizeSpaces label) (url, title) + +image = try $ do +  (tag, attributes) <- htmlTag "img"  +  url <- case (extractAttribute "src" attributes) of +           Just url -> return url +           Nothing  -> fail "no src" +  let title = fromMaybe "" $ extractAttribute "title" attributes +  let alt = fromMaybe "" (extractAttribute "alt" attributes) +  return $ Image [Str alt] (url, title) + diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..37cc2bfe4 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,651 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.LaTeX +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of LaTeX to 'Pandoc' document. +-} +module Text.Pandoc.Readers.LaTeX (  +                                  readLaTeX, +                                  rawLaTeXInline, +                                  rawLaTeXEnvironment +                                 ) where + +import Text.ParserCombinators.Parsec +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Data.Maybe ( fromMaybe ) +import Data.Char ( chr ) +import Data.List ( isPrefixOf, isSuffixOf ) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: ParserState   -- ^ Parser state, including options for parser +          -> String        -- ^ String to parse +          -> Pandoc +readLaTeX = readWith parseLaTeX + +-- characters with special meaning +specialChars = "\\`$%^&_~#{}\n \t|<>'\"-" + +-- +-- utility functions +-- + +-- | Returns text between brackets and its matching pair. +bracketedText openB closeB = do +  result <- charsInBalanced' openB closeB +  return $ [openB] ++ result ++ [closeB] + +-- | Returns an option or argument of a LaTeX command. +optOrArg = bracketedText '{' '}' <|> bracketedText '[' ']' + +-- | True if the string begins with '{'. +isArg ('{':rest) = True +isArg other = False + +-- | Returns list of options and arguments of a LaTeX command. +commandArgs = many optOrArg + +-- | Parses LaTeX command, returns (name, star, list of options or arguments). +command = do +  char '\\' +  name <- many1 letter +  star <- option "" (string "*")  -- some commands have starred versions +  args <- commandArgs +  return (name, star, args) + +begin name = try $ do +  string $ "\\begin{" ++ name ++ "}" +  optional commandArgs +  spaces +  return name + +end name = try $ do +  string $ "\\end{" ++ name ++ "}" +  spaces +  return name + +-- | Returns a list of block elements containing the contents of an +-- environment. +environment name = try $ begin name >> spaces >> manyTill block (end name) + +anyEnvironment =  try $ do +  string "\\begin{" +  name <- many letter +  star <- option "" (string "*") -- some environments have starred variants +  char '}' +  optional commandArgs +  spaces +  contents <- manyTill block (end (name ++ star)) +  return $ BlockQuote contents + +-- +-- parsing documents +-- + +-- | Process LaTeX preamble, extracting metadata. +processLaTeXPreamble = try $ manyTill  +  (choice [bibliographic, comment, unknownCommand, nullBlock])  +  (try (string "\\begin{document}")) >>  +  spaces + +-- | Parse LaTeX and return 'Pandoc'. +parseLaTeX = do +  optional processLaTeXPreamble -- preamble might not be present (fragment) +  spaces +  blocks <- parseBlocks +  spaces +  optional $ try (string "\\end{document}" >> many anyChar)  +  -- might not be present (fragment) +  spaces +  eof +  state <- getState +  let blocks' = filter (/= Null) blocks +  let title' = stateTitle state +  let authors' = stateAuthors state +  let date' = stateDate state +  return $ Pandoc (Meta title' authors' date')  blocks' + +-- +-- parsing blocks +-- + +parseBlocks = spaces >> many block + +block = choice [ hrule +               , codeBlock +               , header +               , list +               , blockQuote +               , mathBlock +               , comment +               , bibliographic +               , para +               , specialEnvironment +               , itemBlock +               , unknownEnvironment +               , unknownCommand ] <?> "block" + +-- +-- header blocks +-- + +header = try $ do +  char '\\' +  subs <- many (try (string "sub")) +  string "section" +  optional (char '*') +  char '{' +  title <- manyTill inline (char '}') +  spaces +  return $ Header (length subs + 1) (normalizeSpaces title) + +-- +-- hrule block +-- + +hrule = oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n",  +                       "\\newpage" ] >> spaces >> return HorizontalRule + +-- +-- code blocks +-- + +codeBlock = codeBlock1 <|> codeBlock2 + +codeBlock1 = try $ do +  string "\\begin{verbatim}"  -- don't use begin function because it  +                              -- gobbles whitespace +  optional blanklines         -- we want to gobble blank lines, but not  +                              -- leading space +  contents <- manyTill anyChar (try (string "\\end{verbatim}")) +  spaces +  return $ CodeBlock (stripTrailingNewlines contents) + +codeBlock2 = try $ do +  string "\\begin{Verbatim}"  -- used by fancyvrb package +  option "" blanklines +  contents <- manyTill anyChar (try (string "\\end{Verbatim}")) +  spaces +  return $ CodeBlock (stripTrailingNewlines contents) + +-- +-- block quotes +-- + +blockQuote = (environment "quote" <|> environment "quotation") >>~ spaces >>=  +             return . BlockQuote + +-- +-- math block +-- + +mathBlock = mathBlockWith (begin "equation") (end "equation") <|>  +            mathBlockWith (begin "displaymath") (end "displaymath") <|> +            mathBlockWith (string "\\[") (string "\\]") <?> "math block" + +mathBlockWith start end = try $ do +  start +  spaces +  result <- manyTill anyChar end +  spaces +  return $ BlockQuote [Para [TeX ("$" ++ result ++ "$")]] + +-- +-- list blocks +-- + +list = bulletList <|> orderedList <|> definitionList <?> "list" + +listItem = try $ do +  ("item", _, args) <- command +  spaces +  state <- getState +  let oldParserContext = stateParserContext state +  updateState (\state -> state {stateParserContext = ListItemState}) +  blocks <- many block +  updateState (\state -> state {stateParserContext = oldParserContext}) +  opt <- case args of +           ([x]) | "[" `isPrefixOf` x && "]" `isSuffixOf` x ->  +                       parseFromString (many inline) $ tail $ init x +           _        -> return [] +  return (opt, blocks) + +orderedList = try $ do +  string "\\begin{enumerate}" +  (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ +                              try $ do failIfStrict +                                       char '[' +                                       res <- anyOrderedListMarker +                                       char ']' +                                       return res +  spaces +  option "" $ try $ do string "\\setlength{\\itemindent}" +                       char '{' +                       manyTill anyChar (char '}') +  spaces +  start <- option 1 $ try $ do failIfStrict +                               string "\\setcounter{enum" +                               many1 (oneOf "iv") +                               string "}{" +                               num <- many1 digit +                               char '}'  +                               spaces +                               return $ (read num) + 1 +  items <- many listItem +  end "enumerate" +  spaces +  return $ OrderedList (start, style, delim) $ map snd items + +bulletList = try $ do +  begin "itemize" +  spaces +  items <- many listItem +  end "itemize" +  spaces +  return (BulletList $ map snd items) + +definitionList = try $ do +  begin "description" +  spaces +  items <- many listItem +  end "description" +  spaces +  return (DefinitionList items) + +-- +-- paragraph block +-- + +para = many1 inline >>~ spaces >>= return . Para . normalizeSpaces + +-- +-- title authors date +-- + +bibliographic = choice [ maketitle, title, authors, date ] + +maketitle = try (string "\\maketitle") >> spaces >> return Null + +title = try $ do +  string "\\title{" +  tit <- manyTill inline (char '}') +  spaces +  updateState (\state -> state { stateTitle = tit }) +  return Null + +authors = try $ do +  string "\\author{" +  authors <- manyTill anyChar (char '}') +  spaces +  let authors' = map removeLeadingTrailingSpace $ lines $ +                 substitute "\\\\" "\n" authors +  updateState (\state -> state { stateAuthors = authors' }) +  return Null + +date = try $ do +  string "\\date{" +  date' <- manyTill anyChar (char '}') +  spaces +  updateState (\state -> state { stateDate = date' }) +  return Null + +-- +-- item block +-- for use in unknown environments that aren't being parsed as raw latex +-- + +-- this forces items to be parsed in different blocks +itemBlock = try $ do +  ("item", _, args) <- command +  state <- getState +  if (stateParserContext state == ListItemState) +     then fail "item should be handled by list block" +     else if null args  +             then return Null +             else return $ Plain [Str (stripFirstAndLast (head args))] + +-- +-- raw LaTeX  +-- + +specialEnvironment = do  -- these are always parsed as raw +  lookAhead (choice (map (\name -> begin name)  ["tabular", "figure", +              "tabbing", "eqnarry", "picture", "table", "verse", "theorem"])) +  rawLaTeXEnvironment + +-- | Parse any LaTeX environment and return a Para block containing +-- the whole literal environment as raw TeX. +rawLaTeXEnvironment :: GenParser Char st Block +rawLaTeXEnvironment = try $ do +  string "\\begin{" +  name <- many1 letter +  star <- option "" (string "*") -- for starred variants +  let name' = name ++ star +  char '}' +  args <- option [] commandArgs +  let argStr = concat args +  contents <- manyTill (choice [ (many1 (noneOf "\\")),  +                                 (do  +                                    (Para [TeX str]) <- rawLaTeXEnvironment +                                    return str), +                                 string "\\" ])  +                       (end name') +  spaces +  return $ Para [TeX $ "\\begin{" ++ name' ++ "}" ++ argStr ++  +                 concat contents ++ "\\end{" ++ name' ++ "}"] + +unknownEnvironment = try $ do +  state <- getState +  result <- if stateParseRaw state -- check whether we should include raw TeX  +               then rawLaTeXEnvironment -- if so, get whole raw environment +               else anyEnvironment      -- otherwise just the contents +  return result + +unknownCommand = try $ do +  notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",  +                                     "document"] +  (name, star, args) <- command +  spaces +  let argStr = concat args +  state <- getState +  if name == "item" && (stateParserContext state) == ListItemState +     then fail "should not be parsed as raw" +     else string "" +  if stateParseRaw state +     then return $ Plain [TeX ("\\" ++ name ++ star ++ argStr)] +     else return $ Plain [Str (joinWithSep " " args)] + +-- latex comment +comment = try $ char '%' >> manyTill anyChar newline >> spaces >> return Null + +--  +-- inline +-- + +inline =  choice [ str +                 , endline +                 , whitespace +                 , quoted +                 , apostrophe +                 , spacer +                 , strong +                 , math +                 , ellipses +                 , emDash +                 , enDash +                 , hyphen +                 , emph +                 , strikeout +                 , superscript +                 , subscript +                 , ref +                 , lab +                 , code +                 , url +                 , link +                 , image +                 , footnote +                 , linebreak +                 , accentedChar +                 , specialChar +                 , rawLaTeXInline +                 , escapedChar +                 , unescapedChar +                 ] <?> "inline" + +accentedChar = normalAccentedChar <|> specialAccentedChar + +normalAccentedChar = try $ do +  char '\\' +  accent <- oneOf "'`^\"~" +  character <- (try $ char '{' >> letter >>~ char '}') <|> letter +  let table = fromMaybe [] $ lookup character accentTable  +  let result = case lookup accent table of +                 Just num  -> chr num +                 Nothing   -> '?' +  return $ Str [result] + +-- an association list of letters and association list of accents +-- and decimal character numbers. +accentTable =  +  [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]), +    ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]), +    ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]), +    ('N', [('~', 209)]), +    ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]), +    ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]), +    ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]), +    ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]), +    ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]), +    ('n', [('~', 241)]), +    ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]), +    ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ] + +specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, +                               oslash, pound, euro, copyright, sect ] + +ccedil = try $ do +  char '\\' +  letter <- oneOfStrings ["cc", "cC"] +  let num = if letter == "cc" then 231 else 199 +  return $ Str [chr num] + +aring = try $ do +  char '\\' +  letter <- oneOfStrings ["aa", "AA"] +  let num = if letter == "aa" then 229 else 197 +  return $ Str [chr num] + +iuml = try (string "\\\"") >> oneOfStrings ["\\i", "{\\i}"] >>  +       return (Str [chr 239]) + +icirc = try (string "\\^") >> oneOfStrings ["\\i", "{\\i}"] >> +        return (Str [chr 238]) + +szlig = try (string "\\ss") >> return (Str [chr 223]) + +oslash = try $ do +  char '\\' +  letter <- choice [char 'o', char 'O'] +  let num = if letter == 'o' then 248 else 216 +  return $ Str [chr num] + +aelig = try $ do +  char '\\' +  letter <- oneOfStrings ["ae", "AE"] +  let num = if letter == "ae" then 230 else 198 +  return $ Str [chr num] + +pound = try (string "\\pounds") >> return (Str [chr 163]) + +euro = try (string "\\euro") >> return (Str [chr 8364]) + +copyright = try (string "\\copyright") >> return (Str [chr 169]) + +sect = try (string "\\S") >> return (Str [chr 167]) + +escapedChar = do +  result <- escaped (oneOf " $%&_#{}\n") +  return $ if result == Str "\n" then Str " " else result + +-- ignore standalone, nonescaped special characters +unescapedChar = oneOf "`$^&_#{}|<>" >> return (Str "") + +specialChar = choice [ backslash, tilde, caret, bar, lt, gt, doubleQuote ] + +backslash = try (string "\\textbackslash") >> return (Str "\\") + +tilde = try (string "\\ensuremath{\\sim}") >> return (Str "~") + +caret = try (string "\\^{}") >> return (Str "^") + +bar = try (string "\\textbar") >> return (Str "\\") + +lt = try (string "\\textless") >> return (Str "<") + +gt = try (string "\\textgreater") >> return (Str ">") + +doubleQuote = char '"' >> return (Str "\"") + +code = code1 <|> code2 + +code1 = try $ do  +  string "\\verb" +  marker <- anyChar +  result <- manyTill anyChar (char marker) +  return $ Code $ removeLeadingTrailingSpace result + +code2 = try $ do +  string "\\texttt{" +  result <- manyTill (noneOf "\\\n~$%^&{}") (char '}') +  return $ Code result + +emph = try $ oneOfStrings [ "\\emph{", "\\textit{" ] >> +             manyTill inline (char '}') >>= return . Emph + +strikeout = try $ string "\\sout{" >> manyTill inline (char '}') >>= +                  return . Strikeout + +superscript = try $ string "\\textsuperscript{" >>  +                    manyTill inline (char '}') >>= return . Superscript + +-- note: \textsubscript isn't a standard latex command, but we use +-- a defined version in pandoc. +subscript = try $ string "\\textsubscript{" >> manyTill inline (char '}') >>= +                  return . Subscript + +apostrophe = char '\'' >> return Apostrophe + +quoted = doubleQuoted <|> singleQuoted + +singleQuoted = enclosed singleQuoteStart singleQuoteEnd inline >>= +               return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted = enclosed doubleQuoteStart doubleQuoteEnd inline >>= +               return . Quoted DoubleQuote . normalizeSpaces + +singleQuoteStart = char '`' + +singleQuoteEnd = try $ char '\'' >> notFollowedBy alphaNum + +doubleQuoteStart = string "``" + +doubleQuoteEnd = try $ string "''" + +ellipses = try $ string "\\ldots" >> optional (try (string "{}")) >> +                 return Ellipses + +enDash = try (string "--") >> return EnDash + +emDash = try (string "---") >> return EmDash + +hyphen = char '-' >> return (Str "-") + +lab = try $ do +  string "\\label{" +  result <- manyTill anyChar (char '}') +  return $ Str $ "(" ++ result ++ ")" + +ref = try (string "\\ref{") >> manyTill anyChar (char '}') >>= return . Str + +strong = try (string "\\textbf{") >> manyTill inline (char '}') >>= +         return . Strong + +whitespace = many1 (oneOf "~ \t") >> return Space + +-- hard line break +linebreak = try (string "\\\\") >> return LineBreak + +spacer = try (string "\\,") >> return (Str "") + +str = many1 (noneOf specialChars) >>= return . Str + +-- endline internal to paragraph +endline = try $ newline >> notFollowedBy blankline >> return Space + +-- math +math = math1 <|> math2 <?> "math" + +math1 = try $ do +  char '$' +  result <- many (noneOf "$") +  char '$' +  return $ TeX ("$" ++ result ++ "$") + +math2 = try $ do +  string "\\(" +  result <- many (noneOf "$") +  string "\\)" +  return $ TeX ("$" ++ result ++ "$") + +-- +-- links and images +-- + +url = try $ do +  string "\\url" +  url <- charsInBalanced '{' '}' +  return $ Link [Code url] (url, "") + +link = try $ do +  string "\\href{" +  url <- manyTill anyChar (char '}') +  char '{' +  label <- manyTill inline (char '}')  +  return $ Link (normalizeSpaces label) (url, "") + +image = try $ do +  ("includegraphics", _, args) <- command +  let args' = filter isArg args -- filter out options +  let src = if null args' then +              ("", "") +            else +              (stripFirstAndLast (head args'), "") +  return $ Image [Str "image"] src + +footnote = try $ do +  (name, _, (contents:[])) <- command +  if ((name == "footnote") || (name == "thanks")) +     then string "" +     else fail "not a footnote or thanks command" +  let contents' = stripFirstAndLast contents +  -- parse the extracted block, which may contain various block elements: +  rest <- getInput +  setInput $ contents' +  blocks <- parseBlocks +  setInput rest +  return $ Note blocks + +-- | Parse any LaTeX command and return it in a raw TeX inline element. +rawLaTeXInline :: GenParser Char ParserState Inline +rawLaTeXInline = try $ do +  (name, star, args) <- command +  state <- getState +  if ((name == "begin") || (name == "end") || (name == "item")) +     then fail "not an inline command"  +     else string "" +  return $ TeX ("\\" ++ name ++ star ++ concat args) + diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..df84c0ac7 --- /dev/null +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,909 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.Markdown +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of markdown-formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Markdown (  +                                     readMarkdown  +                                    ) where + +import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy ) +import Data.Ord ( comparing ) +import Data.Char ( isAlphaNum ) +import Network.URI ( isURI ) +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,  +                                  anyHtmlInlineTag, anyHtmlTag, +                                  anyHtmlEndTag, htmlEndTag, extractTagType, +                                  htmlBlockElement ) +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Text.ParserCombinators.Parsec + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: ParserState -> String -> Pandoc +readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n") + +-- +-- Constants and data structure definitions +-- + +spaceChars = " \t" +bulletListMarkers = "*+-" +hruleChars = "*-_" +setextHChars = "=-" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\[]*_~`<>$!^-.&'\"" + +-- +-- auxiliary functions +-- + +indentSpaces = try $ do +  state <- getState +  let tabStop = stateTabStop state +  try (count tabStop (char ' ')) <|>  +    (many (char ' ') >> string "\t") <?> "indentation" + +nonindentSpaces = do +  state <- getState +  let tabStop = stateTabStop state +  sps <- many (char ' ') +  if length sps < tabStop  +     then return sps +     else unexpected "indented line" + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine = do +  pos <- getPosition +  if sourceColumn pos == 1 then return () else fail "not beginning of line" + +-- | Fail unless we're in "smart typography" mode. +failUnlessSmart = do +  state <- getState +  if stateSmart state then return () else fail "Smart typography feature" + +-- | Parse an inline Str element with a given content. +inlineString str = try $ do  +  (Str res) <- inline  +  if res == str then return res else fail $ "unexpected Str content" + +-- | Parse a sequence of inline elements between a string +--  @opener@ and a string @closer@, including inlines +--  between balanced pairs of @opener@ and a @closer@. +inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline] +inlinesInBalanced opener closer = try $ do +  string opener +  result <- manyTill ( (do lookAhead (inlineString opener) +                           -- because it might be a link... +                           bal <- inlinesInBalanced opener closer  +                           return $ [Str opener] ++ bal ++ [Str closer]) +                       <|> (count 1 inline))  +                     (try (string closer)) +  return $ concat result + +-- +-- document structure +-- + +titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline + +authorsLine = try $ do  +  char '%' +  skipSpaces +  authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") +  newline +  return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors + +dateLine = try $ do +  char '%' +  skipSpaces +  date <- many (noneOf "\n") +  newline +  return $ decodeCharacterReferences $ removeTrailingSpace date + +titleBlock = try $ do +  failIfStrict +  title <- option [] titleLine +  author <- option [] authorsLine +  date <- option "" dateLine +  optional blanklines +  return (title, author, date) + +parseMarkdown = do +  -- markdown allows raw HTML +  updateState (\state -> state { stateParseRaw = True }) +  startPos <- getPosition +  -- go through once just to get list of reference keys +  -- docMinusKeys is the raw document with blanks where the keys were... +  docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>=  +                  return . concat +  setInput docMinusKeys +  setPosition startPos +  st <- getState +  -- go through again for notes unless strict... +  if stateStrict st +     then return () +     else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=  +                              return . concat +             st <- getState +             let reversedNotes = stateNotes st +             updateState $ \st -> st { stateNotes = reverse reversedNotes } +             setInput docMinusNotes +             setPosition startPos +  -- now parse it for real... +  (title, author, date) <- option ([],[],"") titleBlock +  blocks <- parseBlocks  +  return $ Pandoc (Meta title author date) $ filter (/= Null) blocks + +--  +-- initial pass for references and notes +-- + +referenceKey = try $ do +  startPos <- getPosition +  nonindentSpaces +  label <- reference +  char ':' +  skipSpaces +  optional (char '<') +  src <- many (noneOf "> \n\t") +  optional (char '>') +  tit <- option "" referenceTitle +  blanklines +  endPos <- getPosition +  let newkey = (label, (removeTrailingSpace src,  tit)) +  st <- getState +  let oldkeys = stateKeys st +  updateState $ \st -> st { stateKeys = newkey : oldkeys } +  -- return blanks so line count isn't affected +  return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +referenceTitle = try $ do  +  (many1 spaceChar >> option '\n' newline) <|> newline +  skipSpaces +  tit <-    (charsInBalanced '(' ')' >>= return . unwords . words) +        <|> do delim <- char '\'' <|> char '"' +               manyTill anyChar (try (char delim >> skipSpaces >> +                                      notFollowedBy (noneOf ")\n"))) +  return $ decodeCharacterReferences tit + +noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') + +rawLine = do +  notFollowedBy blankline +  notFollowedBy' noteMarker +  contents <- many1 nonEndline +  end <- option "" (newline >> optional indentSpaces >> return "\n")  +  return $ contents ++ end + +rawLines = many1 rawLine >>= return . concat + +noteBlock = try $ do +  startPos <- getPosition +  ref <- noteMarker +  char ':' +  optional blankline +  optional indentSpaces +  raw <- sepBy rawLines (try (blankline >> indentSpaces)) +  optional blanklines +  endPos <- getPosition +  -- parse the extracted text, which may contain various block elements: +  contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" +  let newnote = (ref, contents) +  st <- getState +  let oldnotes = stateNotes st +  updateState $ \st -> st { stateNotes = newnote : oldnotes } +  -- return blanks so line count isn't affected +  return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- +-- parsing blocks +-- + +parseBlocks = manyTill block eof + +block = choice [ header  +               , table +               , codeBlock +               , hrule +               , list +               , blockQuote +               , htmlBlock +               , rawLaTeXEnvironment' +               , para +               , plain +               , nullBlock ] <?> "block" + +-- +-- header blocks +-- + +header = atxHeader <|> setextHeader <?> "header" + +atxHeader = try $ do +  level <- many1 (char '#') >>= return . length +  notFollowedBy (char '.' <|> char ')') -- this would be a list +  skipSpaces +  text <- manyTill inline atxClosing >>= return . normalizeSpaces +  return $ Header level text + +atxClosing = try $ skipMany (char '#') >> blanklines + +setextHeader = try $ do +  -- first, see if this block has any chance of being a setextHeader: +  lookAhead (anyLine >> oneOf setextHChars) +  text <- many1Till inline newline >>= return . normalizeSpaces +  level <- choice $ zipWith  +           (\ch lev -> try (many1 $ char ch) >> blanklines >> return lev) +           setextHChars [1..(length setextHChars)] +  return $ Header level text + +-- +-- hrule block +-- + +hrule = try $ do +  skipSpaces +  start <- oneOf hruleChars +  count 2 (skipSpaces >> char start) +  skipMany (skipSpaces >> char start) +  newline +  optional blanklines +  return HorizontalRule + +-- +-- code blocks +-- + +indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") + +codeBlock = do +  contents <- many1 (indentedLine <|>  +                     try (do b <- blanklines +                             l <- indentedLine +                             return $ b ++ l)) +  optional blanklines +  return $ CodeBlock $ stripTrailingNewlines $ concat contents + +-- +-- block quotes +-- + +emacsBoxQuote = try $ do +  failIfStrict +  string ",----" +  manyTill anyChar newline +  raw <- manyTill  +         (try (char '|' >> optional (char ' ') >> manyTill anyChar newline)) +         (try (string "`----")) +  blanklines +  return raw + +emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') + +emailBlockQuote = try $ do +  emailBlockQuoteStart +  raw <- sepBy (many (nonEndline <|>  +                      (try (endline >> notFollowedBy emailBlockQuoteStart >> +                       return '\n')))) +               (try (newline >> emailBlockQuoteStart)) +  newline <|> (eof >> return '\n') +  optional blanklines +  return raw + +blockQuote = do  +  raw <- emailBlockQuote <|> emacsBoxQuote +  -- parse the extracted block, which may contain various block elements: +  contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n" +  return $ BlockQuote contents +  +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +bulletListStart = try $ do +  optional newline -- if preceded by a Plain block in a list context +  nonindentSpaces +  notFollowedBy' hrule     -- because hrules start out just like lists +  oneOf bulletListMarkers +  spaceChar +  skipSpaces + +anyOrderedListStart = try $ do +  optional newline -- if preceded by a Plain block in a list context +  nonindentSpaces +  notFollowedBy $ string "p." >> spaceChar >> digit  -- page number +  state <- getState +  if stateStrict state +     then do many1 digit +             char '.' +             spaceChar +             return (1, DefaultStyle, DefaultDelim) +     else anyOrderedListMarker >>~ spaceChar + +orderedListStart style delim = try $ do +  optional newline -- if preceded by a Plain block in a list context +  nonindentSpaces +  state <- getState +  num <- if stateStrict state +            then do many1 digit +                    char '.' +                    return 1 +            else orderedListMarker style delim  +  if delim == Period && (style == UpperAlpha || (style == UpperRoman && +     num `elem` [1, 5, 10, 50, 100, 500, 1000])) +     then char '\t' <|> (spaceChar >> spaceChar) +     else spaceChar +  skipSpaces + +-- parse a line of a list item (start = parser for beginning of list item) +listLine start = try $ do +  notFollowedBy' start +  notFollowedBy blankline +  notFollowedBy' (do indentSpaces +                     many (spaceChar) +                     bulletListStart <|> (anyOrderedListStart >> return ())) +  line <- manyTill anyChar newline +  return $ line ++ "\n" + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = try $ do +  start +  result <- many1 (listLine start) +  blanks <- many blankline +  return $ concat result ++ blanks + +-- continuation of a list item - indented and separated by blankline  +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation start = try $ do +  lookAhead indentSpaces +  result <- many1 (listContinuationLine start) +  blanks <- many blankline +  return $ concat result ++ blanks + +listContinuationLine start = try $ do +  notFollowedBy blankline +  notFollowedBy' start +  optional indentSpaces +  result <- manyTill anyChar newline +  return $ result ++ "\n" + +listItem start = try $ do  +  first <- rawListItem start +  continuations <- many (listContinuation start) +  -- parsing with ListItemState forces markers at beginning of lines to +  -- count as list item markers, even if not separated by blank space. +  -- see definition of "endline" +  state <- getState +  let oldContext = stateParserContext state +  setState $ state {stateParserContext = ListItemState} +  -- parse the extracted block, which may contain various block elements: +  let raw = concat (first:continuations) +  contents <- parseFromString parseBlocks raw +  updateState (\st -> st {stateParserContext = oldContext}) +  return contents + +orderedList = try $ do +  (start, style, delim) <- lookAhead anyOrderedListStart +  items <- many1 (listItem (orderedListStart style delim)) +  return $ OrderedList (start, style, delim) $ compactify items + +bulletList = many1 (listItem bulletListStart) >>=  +             return . BulletList . compactify + +-- definition lists + +definitionListItem = try $ do +  notFollowedBy blankline +  notFollowedBy' indentSpaces +  -- first, see if this has any chance of being a definition list: +  lookAhead (anyLine >> char ':') +  term <- manyTill inline newline +  raw <- many1 defRawBlock +  state <- getState +  let oldContext = stateParserContext state +  -- parse the extracted block, which may contain various block elements: +  contents <- parseFromString parseBlocks $ concat raw +  updateState (\st -> st {stateParserContext = oldContext}) +  return ((normalizeSpaces term), contents) + +defRawBlock = try $ do +  char ':' +  state <- getState +  let tabStop = stateTabStop state +  try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") +  firstline <- anyLine +  rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) +  trailing <- option "" blanklines +  return $ firstline ++ "\n" ++ unlines rawlines ++ trailing + +definitionList = do +  failIfStrict +  items <- many1 definitionListItem +  let (terms, defs) = unzip items +  let defs' = compactify defs +  let items' = zip terms defs' +  return $ DefinitionList items' + +-- +-- paragraph block +-- + +para = try $ do  +  result <- many1 inline +  newline +  blanklines <|> do st <- getState +                    if stateStrict st +                       then lookAhead (blockQuote <|> header) >> return "" +                       else lookAhead emacsBoxQuote >> return "" +  return $ Para $ normalizeSpaces result + +plain = many1 inline >>= return . Plain . normalizeSpaces  + +--  +-- raw html +-- + +htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" + +htmlBlock = do +  st <- getState +  if stateStrict st +    then try $ do failUnlessBeginningOfLine +                  first <- htmlElement +                  finalSpace <- many (oneOf spaceChars) +                  finalNewlines <- many newline +                  return $ RawHtml $ first ++ finalSpace ++ finalNewlines +    else rawHtmlBlocks + +-- True if tag is self-closing +isSelfClosing tag =  +  isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag + +strictHtmlBlock = try $ do +  tag <- anyHtmlBlockTag  +  let tag' = extractTagType tag +  if isSelfClosing tag || tag' == "hr"  +     then return tag +     else do contents <- many (notFollowedBy' (htmlEndTag tag') >>  +                               (htmlElement <|> (count 1 anyChar))) +             end <- htmlEndTag tag' +             return $ tag ++ concat contents ++ end + +rawHtmlBlocks = do +  htmlBlocks <- many1 rawHtmlBlock     +  let combined = concatMap (\(RawHtml str) -> str) htmlBlocks +  let combined' = if not (null combined) && last combined == '\n' +                     then init combined  -- strip extra newline  +                     else combined  +  return $ RawHtml combined' + +-- +-- LaTeX +-- + +rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment + +-- +-- Tables +--  + +-- Parse a dashed line with optional trailing spaces; return its length +-- and the length including trailing space. +dashedLine ch = do +  dashes <- many1 (char ch) +  sp     <- many spaceChar +  return $ (length dashes, length $ dashes ++ sp) + +-- Parse a table header with dashed lines of '-' preceded by  +-- one line of text. +simpleTableHeader = try $ do +  rawContent  <- anyLine +  initSp      <- nonindentSpaces +  dashes      <- many1 (dashedLine '-') +  newline +  let (lengths, lines) = unzip dashes +  let indices  = scanl (+) (length initSp) lines +  let rawHeads = tail $ splitByIndices (init indices) rawContent +  let aligns   = zipWith alignType (map (\a -> [a]) rawHeads) lengths +  return (rawHeads, aligns, indices) + +-- Parse a table footer - dashed lines followed by blank line. +tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines + +-- Parse a table separator - dashed line. +tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" + +-- Parse a raw line and split it into chunks by indices. +rawTableLine indices = do +  notFollowedBy' (blanklines <|> tableFooter) +  line <- many1Till anyChar newline +  return $ map removeLeadingTrailingSpace $ tail $  +           splitByIndices (init indices) line + +-- Parse a table line and return a list of lists of blocks (columns). +tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) + +-- Parse a multiline table row and return a list of blocks (columns). +multilineRow indices = do +  colLines <- many1 (rawTableLine indices) +  optional blanklines +  let cols = map unlines $ transpose colLines +  mapM (parseFromString (many plain)) cols + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int     -- Number of columns on terminal +                  -> [Int]   -- Indices +                  -> [Float] -- Fractional relative sizes of columns +widthsFromIndices _ [] = []   +widthsFromIndices numColumns indices =  +  let lengths = zipWith (-) indices (0:indices) +      totLength = sum lengths +      quotient = if totLength > numColumns +                   then fromIntegral totLength +                   else fromIntegral numColumns +      fracs = map (\l -> (fromIntegral l) / quotient) lengths in +  tail fracs + +-- Parses a table caption:  inlines beginning with 'Table:' +-- and followed by blank lines. +tableCaption = try $ do +  nonindentSpaces +  string "Table:" +  result <- many1 inline +  blanklines +  return $ normalizeSpaces result + +-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. +tableWith headerParser lineParser footerParser = try $ do +    (rawHeads, aligns, indices) <- headerParser +    lines <- many1Till (lineParser indices) footerParser +    caption <- option [] tableCaption +    heads <- mapM (parseFromString (many plain)) rawHeads +    state <- getState +    let numColumns = stateColumns state +    let widths = widthsFromIndices numColumns indices +    return $ Table caption aligns widths heads lines + +-- Parse a simple table with '---' header and one line per row. +simpleTable = tableWith simpleTableHeader tableLine blanklines + +-- Parse a multiline table:  starts with row of '-' on top, then header +-- (which may be multiline), then the rows, +-- which may be multiline, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +multilineTable = tableWith multilineTableHeader multilineRow tableFooter + +multilineTableHeader = try $ do +  tableSep  +  rawContent  <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) +  initSp      <- nonindentSpaces +  dashes      <- many1 (dashedLine '-') +  newline +  let (lengths, lines) = unzip dashes +  let indices  = scanl (+) (length initSp) lines +  let rawHeadsList = transpose $ map  +                     (\ln -> tail $ splitByIndices (init indices) ln) +                     rawContent +  let rawHeads = map (joinWithSep " ") rawHeadsList +  let aligns   = zipWith alignType rawHeadsList lengths +  return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) + +-- Returns an alignment type for a table, based on a list of strings +-- (the rows of the column header) and a number (the length of the +-- dashed line under the rows. +alignType :: [String] -> Int -> Alignment +alignType []  len = AlignDefault +alignType strLst len = +  let str        = head $ sortBy (comparing length) $  +                          map removeTrailingSpace strLst +      leftSpace  = if null str then False else (str !! 0) `elem` " \t" +      rightSpace = length str < len || (str !! (len - 1)) `elem` " \t" +  in  case (leftSpace, rightSpace) of +        (True,  False)   -> AlignRight +        (False, True)    -> AlignLeft +        (True,  True)    -> AlignCenter +        (False, False)   -> AlignDefault + +table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table" + +--  +-- inline +-- + +inline = choice [ str +                , smartPunctuation +                , whitespace +                , endline +                , code +                , charRef +                , strong +                , emph +                , note +                , inlineNote +                , link +                , image +                , math +                , strikeout +                , superscript +                , subscript +                , autoLink +                , rawHtmlInline' +                , rawLaTeXInline' +                , escapedChar +                , symbol +                , ltSign ] <?> "inline" + +escapedChar = do +  char '\\' +  state <- getState +  result <- option '\\' $ if stateStrict state  +                             then oneOf "\\`*_{}[]()>#+-.!~" +                             else satisfy (not . isAlphaNum) +  return $ Str [result] + +ltSign = do +  st <- getState +  if stateStrict st +     then char '<' +     else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html +  return $ Str ['<'] + +specialCharsMinusLt = filter (/= '<') specialChars + +symbol = do  +  result <- oneOf specialCharsMinusLt +  return $ Str [result] + +-- parses inline code, between n `s and n `s +code = try $ do  +  starts <- many1 (char '`') +  skipSpaces +  result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> +                       (char '\n' >> return " "))  +                      (try (skipSpaces >> count (length starts) (char '`') >>  +                      notFollowedBy (char '`'))) +  return $ Code $ removeLeadingTrailingSpace $ concat result + +mathWord = many1 ((noneOf " \t\n\\$") <|> +                  (try (char '\\') >>~ notFollowedBy (char '$'))) + +math = try $ do +  failIfStrict +  char '$' +  notFollowedBy space +  words <- sepBy1 mathWord (many1 space) +  char '$' +  return $ TeX ("$" ++ (joinWithSep " " words) ++ "$") + +emph = ((enclosed (char '*') (char '*') inline) <|> +        (enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>=  +        return . Emph . normalizeSpaces + +strong = ((enclosed (string "**") (try $ string "**") inline) <|>  +          (enclosed (string "__") (try $ string "__") inline)) >>= +         return . Strong . normalizeSpaces + +strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= +            return . Strikeout . normalizeSpaces + +superscript = failIfStrict >> enclosed (char '^') (char '^')  +              (notFollowedBy' whitespace >> inline) >>= -- may not contain Space +              return . Superscript + +subscript = failIfStrict >> enclosed (char '~') (char '~') +            (notFollowedBy' whitespace >> inline) >>=  -- may not contain Space +            return . Subscript  + +smartPunctuation = failUnlessSmart >>  +                   choice [ quoted, apostrophe, dash, ellipses ] + +apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe + +quoted = doubleQuoted <|> singleQuoted  + +withQuoteContext context parser = do +  oldState <- getState +  let oldQuoteContext = stateQuoteContext oldState +  setState oldState { stateQuoteContext = context } +  result <- parser +  newState <- getState +  setState newState { stateQuoteContext = oldQuoteContext } +  return result + +singleQuoted = try $ do +  singleQuoteStart +  withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= +    return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted = try $ do  +  doubleQuoteStart +  withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= +    return . Quoted DoubleQuote . normalizeSpaces + +failIfInQuoteContext context = do +  st <- getState +  if stateQuoteContext st == context +     then fail "already inside quotes" +     else return () + +singleQuoteStart = do  +  failIfInQuoteContext InSingleQuote +  char '\8216' <|>  +     do char '\''   +        notFollowedBy (oneOf ")!],.;:-? \t\n") +        notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> +                            satisfy (not . isAlphaNum))) -- possess/contraction +        return '\'' + +singleQuoteEnd = (char '\'' <|> char '\8217') >> notFollowedBy alphaNum + +doubleQuoteStart = failIfInQuoteContext InDoubleQuote >> +                   (char '"' <|> char '\8220') >> +                   notFollowedBy (oneOf " \t\n") + +doubleQuoteEnd = char '"' <|> char '\8221' + +ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses + +dash = enDash <|> emDash + +enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash + +emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >> +               skipSpaces >> return EmDash + +whitespace = do +  sps <- many1 (oneOf spaceChars) +  if length sps >= 2 +     then option Space (endline >> return LineBreak) +     else return Space <?> "whitespace" + +nonEndline = satisfy (/='\n') + +strChar = noneOf (specialChars ++ spaceChars ++ "\n") + +str = many1 strChar >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline = try $ do +  newline +  notFollowedBy blankline +  st <- getState +  if stateStrict st  +    then do notFollowedBy emailBlockQuoteStart +            notFollowedBy (char '#')  -- atx header +    else return ()  +  -- parse potential list-starts differently if in a list: +  if stateParserContext st == ListItemState +     then notFollowedBy' (bulletListStart <|>  +                          (anyOrderedListStart >> return ())) +     else return () +  return Space + +-- +-- links +-- + +-- a reference label for a link +reference = notFollowedBy' (string "[^") >>  -- footnote reference +            inlinesInBalanced "[" "]" >>= (return . normalizeSpaces) + +-- source for a link, with optional title +source = try $ do  +  char '(' +  optional (char '<') +  src <- many (noneOf ")> \t\n") +  optional (char '>') +  tit <- option "" linkTitle +  skipSpaces +  char ')' +  return (removeTrailingSpace src, tit) + +linkTitle = try $ do  +  (many1 spaceChar >> option '\n' newline) <|> newline +  skipSpaces +  delim <- char '\'' <|> char '"' +  tit <-   manyTill anyChar (try (char delim >> skipSpaces >> +                                  notFollowedBy (noneOf ")\n"))) +  return $ decodeCharacterReferences tit + +link = try $ do +  label <- reference +  src <- source <|> referenceLink label +  return $ Link label src + +-- a link like [this][ref] or [this][] or [this] +referenceLink label = do +  ref <- option [] (try (optional (char ' ') >>  +                         optional (newline >> skipSpaces) >> reference)) +  let ref' = if null ref then label else ref +  state <- getState +  case lookupKeySrc (stateKeys state) ref' of +     Nothing     -> fail "no corresponding key"  +     Just target -> return target  + +emailAddress = try $ do +  name <- many1 (alphaNum <|> char '+') +  char '@' +  first <- many1 alphaNum +  rest <- many1 (char '.' >> many1 alphaNum) +  return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest) + +uri = try $ do +  str <- many1 (noneOf "\n\t >") +  if isURI str +     then return str +     else fail "not a URI" + +autoLink = try $ do +  char '<' +  src <- uri <|> emailAddress +  char '>' +  let src' = if "mailto:" `isPrefixOf` src +                then drop 7 src +                else src  +  st <- getState +  return $ if stateStrict st +              then Link [Str src'] (src, "") +              else Link [Code src'] (src, "") + +image = try $ do +  char '!' +  (Link label src) <- link +  return $ Image label src + +note = try $ do +  failIfStrict +  ref <- noteMarker +  state <- getState +  let notes = stateNotes state +  case lookup ref notes of +    Nothing       -> fail "note not found" +    Just contents -> return $ Note contents + +inlineNote = try $ do +  failIfStrict +  char '^' +  contents <- inlinesInBalanced "[" "]" +  return $ Note [Para contents] + +rawLaTeXInline' = failIfStrict >> rawLaTeXInline + +rawHtmlInline' = do +  st <- getState +  result <- choice $ if stateStrict st +                        then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]  +                        else [htmlBlockElement, anyHtmlInlineTag] +  return $ HtmlInline result + diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..1239eb688 --- /dev/null +++ b/src/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,640 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.RST  +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion from reStructuredText to 'Pandoc' document. +-} +module Text.Pandoc.Readers.RST (  +                                readRST +                               ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.ParserCombinators.Parsec +import Data.List ( findIndex, delete ) + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: ParserState -> String -> Pandoc +readRST state str = (readWith parseRST) state (str ++ "\n\n") + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers = "*+-" +underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\`|*_<>$:[-" + +-- +-- parsing documents +-- + +isAnonKey (ref, src) = ref == [Str "_"] + +isHeader :: Int -> Block -> Bool +isHeader n (Header x _) = x == n +isHeader _ _ = False + +-- | Promote all headers in a list of blocks.  (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level text):rest) =  +    (Header (level - num) text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders num [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers.  +titleTransform :: [Block]              -- ^ list of blocks +               -> ([Block], [Inline])  -- ^ modified list of blocks, title +titleTransform ((Header 1 head1):(Header 2 head2):rest) =  -- title subtitle +    if (any (isHeader 1) rest) || (any (isHeader 2) rest) +       then ((Header 1 head1):(Header 2 head2):rest, []) +       else ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) +titleTransform ((Header 1 head1):rest) =       -- title, no subtitle +    if (any (isHeader 1) rest) +       then ((Header 1 head1):rest, []) +       else ((promoteHeaders 1 rest), head1) +titleTransform blocks = (blocks, []) + +parseRST = do +  startPos <- getPosition +  -- go through once just to get list of reference keys +  -- docMinusKeys is the raw document with blanks where the keys were... +  docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= return . concat +  setInput docMinusKeys +  setPosition startPos +  st <- getState +  let reversedKeys = stateKeys st +  updateState $ \st -> st { stateKeys = reverse reversedKeys } +  -- now parse it for real... +  blocks <- parseBlocks  +  let blocks' = filter (/= Null) blocks +  state <- getState +  let (blocks'', title) = if stateStandalone state +                              then titleTransform blocks' +                              else (blocks', []) +  let authors = stateAuthors state +  let date = stateDate state +  let title' = if (null title) then (stateTitle state) else title +  return $ Pandoc (Meta title' authors date) blocks'' + +-- +-- parsing blocks +-- + +parseBlocks = manyTill block eof + +block = choice [ codeBlock +               , rawHtmlBlock +               , rawLaTeXBlock +               , fieldList +               , blockQuote +               , imageBlock +               , unknownDirective +               , header +               , hrule +               , list +               , lineBlock +               , para +               , plain +               , nullBlock ] <?> "block" + +-- +-- field list +-- + +fieldListItem indent = try $ do +  string indent +  char ':' +  name <- many1 alphaNum +  string ": " +  skipSpaces +  first <- manyTill anyChar newline +  rest <- option "" $ try $ lookAhead (string indent >> oneOf " \t") >>  +                            indentedBlock +  return (name, joinWithSep " " (first:(lines rest))) + +fieldList = try $ do +  indent <- lookAhead $ many (oneOf " \t") +  items <- many1 $ fieldListItem indent +  blanklines +  let authors = case lookup "Authors" items of +                  Just auth -> [auth] +                  Nothing  -> map snd (filter (\(x,y) -> x == "Author") items) +  if null authors  +     then return ()  +     else updateState $ \st -> st {stateAuthors = authors} +  case (lookup "Date" items) of +           Just dat -> updateState $ \st -> st {stateDate = dat} +           Nothing  -> return () +  case (lookup "Title" items) of +           Just tit -> parseFromString (many inline) tit >>= +                       \t -> updateState $ \st -> st {stateTitle = t} +           Nothing  -> return () +  let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") &&  +                  (x /= "Date") && (x /= "Title")) items +  if null remaining +              then return Null +              else do terms <- mapM (return . (:[]) . Str . fst) remaining +                      defs  <- mapM (parseFromString (many block) . snd)  +                                    remaining +                      return $ DefinitionList $ zip terms defs + +-- +-- line block +-- + +lineBlockLine = try $ do +  string "| " +  white <- many (oneOf " \t") +  line <- manyTill inline newline +  return $ (if null white then [] else [Str white]) ++ line ++ [LineBreak] + +lineBlock = try $ do +  lines <- many1 lineBlockLine +  blanklines +  return $ Para (concat lines) + +-- +-- paragraph block +-- + +para = paraBeforeCodeBlock <|> paraNormal <?> "paragraph" + +codeBlockStart = string "::" >> blankline >> blankline + +-- paragraph that ends in a :: starting a code block +paraBeforeCodeBlock = try $ do +  result <- many1 (notFollowedBy' codeBlockStart >> inline) +  lookAhead (string "::") +  return $ Para $ if last result == Space +                     then normalizeSpaces result +                     else (normalizeSpaces result) ++ [Str ":"] + +-- regular paragraph +paraNormal = try $ do  +  result <- many1 inline +  newline +  blanklines +  return $ Para $ normalizeSpaces result + +plain = many1 inline >>= return . Plain . normalizeSpaces  + +-- +-- image block +-- + +imageBlock = try $ do +  string ".. image:: " +  src <- manyTill anyChar newline +  fields <- option [] $ do indent <- lookAhead $ many (oneOf " /t") +                           many1 $ fieldListItem indent +  optional blanklines +  case lookup "alt" fields of +        Just alt -> return $ Plain [Image [Str alt] (src, alt)] +        Nothing  -> return $ Plain [Image [Str "image"] (src, "")] +-- +-- header blocks +-- + +header = doubleHeader <|> singleHeader <?> "header" + +-- a header with lines on top and bottom +doubleHeader = try $ do +  c <- oneOf underlineChars +  rest <- many (char c)  -- the top line +  let lenTop = length (c:rest) +  skipSpaces +  newline +  txt <- many1 (notFollowedBy blankline >> inline) +  pos <- getPosition +  let len = (sourceColumn pos) - 1 +  if (len > lenTop) then fail "title longer than border" else return () +  blankline              -- spaces and newline +  count lenTop (char c)  -- the bottom line +  blanklines +  -- check to see if we've had this kind of header before.   +  -- if so, get appropriate level.  if not, add to list. +  state <- getState +  let headerTable = stateHeaderTable state +  let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of +        Just ind -> (headerTable, ind + 1) +        Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) +  setState (state { stateHeaderTable = headerTable' }) +  return $ Header level (normalizeSpaces txt) + +-- a header with line on the bottom only +singleHeader = try $ do  +  notFollowedBy' whitespace +  txt <- many1 (do {notFollowedBy blankline; inline}) +  pos <- getPosition +  let len = (sourceColumn pos) - 1 +  blankline +  c <- oneOf underlineChars +  rest <- count (len - 1) (char c) +  many (char c) +  blanklines +  state <- getState +  let headerTable = stateHeaderTable state +  let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of +        Just ind -> (headerTable, ind + 1) +        Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) +  setState (state { stateHeaderTable = headerTable' }) +  return $ Header level (normalizeSpaces txt) + +-- +-- hrule block +-- + +hrule = try $ do +  chr <- oneOf underlineChars +  count 3 (char chr) +  skipMany (char chr) +  blankline +  blanklines +  return HorizontalRule + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine indents = try $ do +  string indents +  result <- manyTill anyChar newline +  return $ result ++ "\n" + +-- two or more indented lines, possibly separated by blank lines. +-- any amount of indentation will work. +indentedBlock = do  +  indents <- lookAhead $ many1 (oneOf " \t") +  lns <- many $ choice $ [ indentedLine indents, +                           try $ do b <- blanklines +                                    l <- indentedLine indents +                                    return (b ++ l) ] +  optional blanklines  +  return $ concat lns + +codeBlock = try $ do +  codeBlockStart +  result <- indentedBlock +  return $ CodeBlock $ stripTrailingNewlines result + +-- +-- raw html +-- + +rawHtmlBlock = try $ string ".. raw:: html" >> blanklines >> +                     indentedBlock >>= return . RawHtml + +-- +-- raw latex +-- + +rawLaTeXBlock = try $ do +  string ".. raw:: latex" +  blanklines +  result <- indentedBlock +  return $ Para [(TeX result)] + +-- +-- block quotes +-- + +blockQuote = do +  raw <- indentedBlock +  -- parse the extracted block, which may contain various block elements: +  contents <- parseFromString parseBlocks $ raw ++ "\n\n" +  return $ BlockQuote contents + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +definitionListItem = try $ do +  term <- many1Till inline endline +  raw <- indentedBlock +  -- parse the extracted block, which may contain various block elements: +  contents <- parseFromString parseBlocks $ raw ++ "\n\n" +  return (normalizeSpaces term, contents) + +definitionList = many1 definitionListItem >>= return . DefinitionList + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart = try $ do +  notFollowedBy' hrule  -- because hrules start out just like lists +  marker <- oneOf bulletListMarkers +  white <- many1 spaceChar +  return $ length (marker:white) + +-- parses ordered list start and returns its length (inc following whitespace) +orderedListStart style delim = try $ do +  (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) +  white <- many1 spaceChar +  return $ markerLen + length white + +-- parse a line of a list item +listLine markerLength = try $ do +  notFollowedBy blankline +  indentWith markerLength +  line <- manyTill anyChar newline +  return $ line ++ "\n" + +-- indent by specified number of spaces (or equiv. tabs) +indentWith num = do +  state <- getState +  let tabStop = stateTabStop state +  if (num < tabStop) +     then count num  (char ' ') +     else choice [ try (count num (char ' ')),  +                   (try (char '\t' >> count (num - tabStop) (char ' '))) ]  + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = do +  markerLength <- start +  firstLine <- manyTill anyChar newline +  restLines <- many (listLine markerLength) +  return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + +-- continuation of a list item - indented and separated by blankline or  +-- (in compact lists) endline.   +-- Note: nested lists are parsed as continuations. +listContinuation markerLength = try $ do +  blanks <- many1 blankline +  result <- many1 (listLine markerLength) +  return $ blanks ++ concat result + +listItem start = try $ do  +  (markerLength, first) <- rawListItem start +  rest <- many (listContinuation markerLength) +  blanks <- choice [ try (many blankline >>~ lookAhead start), +                     many1 blankline ]  -- whole list must end with blank. +  -- parsing with ListItemState forces markers at beginning of lines to +  -- count as list item markers, even if not separated by blank space. +  -- see definition of "endline" +  state <- getState +  let oldContext = stateParserContext state +  setState $ state {stateParserContext = ListItemState} +  -- parse the extracted block, which may itself contain block elements +  parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks +  updateState (\st -> st {stateParserContext = oldContext}) +  return parsed + +orderedList = do +  (start, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar) +  items <- many1 (listItem (orderedListStart style delim)) +  let items' = compactify items +  return $ OrderedList (start, style, delim) items' + +bulletList = many1 (listItem bulletListStart) >>=  +             return . BulletList . compactify + +-- +-- unknown directive (e.g. comment) +-- + +unknownDirective = try $ do +  string ".. " +  manyTill anyChar newline +  many (string "   :" >> many1 (noneOf "\n:") >> char ':' >> +        many1 (noneOf "\n") >> newline) +  optional blanklines +  return Null + +--  +-- reference key +-- + +referenceKey = do +  startPos <- getPosition +  key <- choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] +  st <- getState +  let oldkeys = stateKeys st +  updateState $ \st -> st { stateKeys = key : oldkeys } +  optional blanklines +  endPos <- getPosition +  -- return enough blanks to replace key +  return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +targetURI = do +  skipSpaces +  optional newline +  contents <- many1 (try (many spaceChar >> newline >>  +                          many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") +  blanklines +  return contents + +imageKey = try $ do +  string ".. |" +  ref <- manyTill inline (char '|') +  skipSpaces +  string "image::" +  src <- targetURI +  return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + +anonymousKey = try $ do +  oneOfStrings [".. __:", "__"] +  src <- targetURI +  state <- getState +  return ([Str "_"], (removeLeadingTrailingSpace src, "")) + +regularKeyQuoted = try $ do +  string ".. _`" +  ref <- manyTill inline (char '`') +  char ':' +  src <- targetURI +  return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + +regularKey = try $ do +  string ".. _" +  ref <- manyTill inline (char ':') +  src <- targetURI +  return (normalizeSpaces ref, (removeLeadingTrailingSpace src, "")) + + --  + -- inline + -- + +inline = choice [ link +                , str +                , whitespace +                , endline +                , strong +                , emph +                , code +                , image +                , hyphens +                , superscript +                , subscript +                , escapedChar +                , symbol ] <?> "inline" + +hyphens = do +  result <- many1 (char '-') +  option Space endline  +  -- don't want to treat endline after hyphen or dash as a space +  return $ Str result + +escapedChar = escaped anyChar + +symbol = do  +  result <- oneOf specialChars +  return $ Str [result] + +-- parses inline code, between codeStart and codeEnd +code = try $ do  +  string "``" +  result <- manyTill anyChar (try (string "``")) +  return $ Code $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result + +emph = enclosed (char '*') (char '*') inline >>=  +       return . Emph . normalizeSpaces + +strong = enclosed (string "**") (try $ string "**") inline >>=  +         return . Strong . normalizeSpaces + +interpreted role = try $ do +  optional $ try $ string "\\ " +  result <- enclosed (string $ ":" ++ role ++ ":`") (char '`') anyChar +  nextChar <- lookAhead anyChar +  try (string "\\ ") <|> lookAhead (count 1 $ oneOf " \t\n") <|> (eof >> return "") +  return [Str result] + +superscript = interpreted "sup" >>= (return . Superscript) + +subscript = interpreted "sub" >>= (return . Subscript) + +whitespace = many1 spaceChar >> return Space <?> "whitespace" + +str = notFollowedBy' oneWordReference >>  +      many1 (noneOf (specialChars ++ "\t\n ")) >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline = try $ do +  newline +  notFollowedBy blankline +  -- parse potential list-starts at beginning of line differently in a list: +  st <- getState +  if (stateParserContext st) == ListItemState +     then notFollowedBy (anyOrderedListMarker >> spaceChar) >> +          notFollowedBy' bulletListStart +     else return () +  return Space + +-- +-- links +-- + +link = choice [explicitLink, referenceLink, autoLink]  <?> "link" + +explicitLink = try $ do +  char '`' +  notFollowedBy (char '`') -- `` is marks start of inline code +  label <- manyTill inline (try (do {spaces; char '<'})) +  src <- manyTill (noneOf ">\n ") (char '>') +  skipSpaces +  string "`_" +  return $ Link (normalizeSpaces label) (removeLeadingTrailingSpace src, "") + +reference = try $ do +  char '`' +  notFollowedBy (char '`') +  label <- many1Till inline (char '`')  +  char '_' +  return label + +oneWordReference = do +  raw <- many1 alphaNum +  char '_' +  notFollowedBy alphaNum    -- because this_is_not a link +  return [Str raw] + +referenceLink = try $ do +  label <- reference <|> oneWordReference +  key <- option label (do{char '_'; return [Str "_"]}) -- anonymous link +  state <- getState +  let keyTable = stateKeys state +  src <- case lookupKeySrc keyTable key of +           Nothing     -> fail "no corresponding key" +           Just target -> return target +  -- if anonymous link, remove first anon key so it won't be used again +  let keyTable' = if (key == [Str "_"]) -- anonymous link?  +                    then delete ([Str "_"], src) keyTable -- remove first anon key  +                    else keyTable                     +  setState $ state { stateKeys = keyTable' } +  return $ Link (normalizeSpaces label) src  + +uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://",  +                           "mailto:", "news:", "telnet:" ] + +uri = try $ do +  scheme <- uriScheme +  identifier <- many1 (noneOf " \t\n") +  return $ scheme ++ identifier + +autoURI = do +  src <- uri +  return $ Link [Str src] (src, "") + +emailChar = alphaNum <|> oneOf "-+_." + +emailAddress = try $ do +  firstLetter <- alphaNum +  restAddr <- many emailChar +  let addr = firstLetter:restAddr +  char '@' +  dom <- domain +  return $ addr ++ '@':dom + +domainChar = alphaNum <|> char '-' + +domain = do +  first <- many1 domainChar +  dom <- many1 (try (do{ char '.'; many1 domainChar })) +  return $ joinWithSep "." (first:dom) + +autoEmail = do +  src <- emailAddress +  return $ Link [Str src] ("mailto:" ++ src, "") + +autoLink = autoURI <|> autoEmail + +-- For now, we assume that all substitution references are for images. +image = try $ do +  char '|' +  ref <- manyTill inline (char '|') +  state <- getState +  let keyTable = stateKeys state +  src <- case lookupKeySrc keyTable ref of +           Nothing     -> fail "no corresponding key" +           Just target -> return target +  return $ Image (normalizeSpaces ref) src + diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs new file mode 100644 index 000000000..f27c3ae75 --- /dev/null +++ b/src/Text/Pandoc/Shared.hs @@ -0,0 +1,792 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Shared +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Utility functions and definitions used by the various Pandoc modules. +-} +module Text.Pandoc.Shared (  +                     -- * List processing +                     splitBy, +                     splitByIndices, +                     substitute, +                     joinWithSep, +                     -- * Text processing +                     backslashEscapes, +                     escapeStringUsing, +                     stripTrailingNewlines, +                     removeLeadingTrailingSpace, +                     removeLeadingSpace, +                     removeTrailingSpace, +                     stripFirstAndLast, +                     camelCaseToHyphenated, +                     toRomanNumeral, +                     wrapped, +                     wrapIfNeeded, +                     -- * Parsing +                     (>>~), +                     anyLine, +                     many1Till, +                     notFollowedBy', +                     oneOfStrings, +                     spaceChar, +                     skipSpaces, +                     blankline, +                     blanklines, +                     enclosed, +                     stringAnyCase, +                     parseFromString, +                     lineClump, +                     charsInBalanced, +                     charsInBalanced', +                     romanNumeral, +                     withHorizDisplacement, +                     nullBlock, +                     failIfStrict, +                     escaped, +                     anyOrderedListMarker, +                     orderedListMarker, +                     charRef, +                     readWith, +                     testStringWith, +                     ParserState (..), +                     defaultParserState, +                     HeaderType (..), +                     ParserContext (..), +                     QuoteContext (..), +                     NoteTable, +                     KeyTable, +                     lookupKeySrc, +                     refsMatch, +                     -- * Native format prettyprinting +                     prettyPandoc, +                     -- * Pandoc block and inline list processing +                     orderedListMarkers, +                     normalizeSpaces, +                     compactify, +                     Element (..), +                     hierarchicalize, +                     isHeaderBlock, +                     -- * Writer options +                     WriterOptions (..), +                     defaultWriterOptions +                    ) where + +import Text.Pandoc.Definition +import Text.ParserCombinators.Parsec +import Text.PrettyPrint.HughesPJ ( Doc, fsep ) +import Text.Pandoc.CharacterReferences ( characterReference ) +import Data.Char ( toLower, toUpper, ord, isLower, isUpper ) +import Data.List ( find, isPrefixOf ) +import Control.Monad ( join ) + +-- +-- List processing +-- + +-- | Split list by groups of one or more sep. +splitBy :: (Eq a) => a -> [a] -> [[a]] +splitBy _ [] = [] +splitBy sep lst =  +  let (first, rest) = break (== sep) lst +      rest'         = dropWhile (== sep) rest +  in  first:(splitBy sep rest') + +-- | Split list into chunks divided at specified indices. +splitByIndices :: [Int] -> [a] -> [[a]] +splitByIndices [] lst = [lst] +splitByIndices (x:xs) lst = +    let (first, rest) = splitAt x lst in +    first:(splitByIndices (map (\y -> y - x)  xs) rest) + +-- | Replace each occurrence of one sublist in a list with another. +substitute :: (Eq a) => [a] -> [a] -> [a] -> [a] +substitute _ _ [] = [] +substitute [] _ lst = lst +substitute target replacement lst =  +    if target `isPrefixOf` lst +       then replacement ++ (substitute target replacement $ drop (length target) lst) +       else (head lst):(substitute target replacement $ tail lst) + +-- | Joins a list of lists, separated by another list. +joinWithSep :: [a]    -- ^ List to use as separator +            -> [[a]]  -- ^ Lists to join +            -> [a] +joinWithSep _ [] = [] +joinWithSep sep lst = foldr1 (\a b -> a ++ sep ++ b) lst + +-- +-- Text processing +-- + +-- | Returns an association list of backslash escapes for the +-- designated characters. +backslashEscapes :: [Char]    -- ^ list of special characters to escape +                 -> [(Char, String)] +backslashEscapes = map (\ch -> (ch, ['\\',ch])) + +-- | Escape a string of characters, using an association list of +-- characters and strings. +escapeStringUsing :: [(Char, String)] -> String -> String +escapeStringUsing _ [] = "" +escapeStringUsing escapeTable (x:xs) =  +  case (lookup x escapeTable) of +       Just str  -> str ++ rest +       Nothing   -> x:rest +  where rest = escapeStringUsing escapeTable xs + +-- | Strip trailing newlines from string. +stripTrailingNewlines :: String -> String +stripTrailingNewlines = reverse . dropWhile (== '\n') . reverse + +-- | Remove leading and trailing space (including newlines) from string. +removeLeadingTrailingSpace :: String -> String +removeLeadingTrailingSpace = removeLeadingSpace . removeTrailingSpace + +-- | Remove leading space (including newlines) from string. +removeLeadingSpace :: String -> String +removeLeadingSpace = dropWhile (`elem` " \n\t") + +-- | Remove trailing space (including newlines) from string. +removeTrailingSpace :: String -> String +removeTrailingSpace = reverse . removeLeadingSpace . reverse + +-- | Strip leading and trailing characters from string +stripFirstAndLast :: String -> String +stripFirstAndLast str = +  drop 1 $ take ((length str) - 1) str + +-- | Change CamelCase word to hyphenated lowercase (e.g., camel-case).  +camelCaseToHyphenated :: String -> String +camelCaseToHyphenated [] = "" +camelCaseToHyphenated (a:b:rest) | isLower a && isUpper b = +  a:'-':(toLower b):(camelCaseToHyphenated rest) +camelCaseToHyphenated (a:rest) = (toLower a):(camelCaseToHyphenated rest) + +-- | Convert number < 4000 to uppercase roman numeral. +toRomanNumeral :: Int -> String +toRomanNumeral x = +  if x >= 4000 || x < 0 +     then "?" +     else case x of +              _ | x >= 1000 -> "M" ++ toRomanNumeral (x - 1000) +              _ | x >= 900  -> "CM" ++ toRomanNumeral (x - 900) +              _ | x >= 500  -> "D" ++ toRomanNumeral (x - 500) +              _ | x >= 400  -> "CD" ++ toRomanNumeral (x - 400) +              _ | x >= 100  -> "C" ++ toRomanNumeral (x - 100) +              _ | x >= 90   -> "XC" ++ toRomanNumeral (x - 90) +              _ | x >= 50   -> "L"  ++ toRomanNumeral (x - 50) +              _ | x >= 40   -> "XL" ++ toRomanNumeral (x - 40) +              _ | x >= 10   -> "X" ++ toRomanNumeral (x - 10) +              _ | x >= 9    -> "IX" ++ toRomanNumeral (x - 5) +              _ | x >= 5    -> "V" ++ toRomanNumeral (x - 5) +              _ | x >= 4    -> "IV" ++ toRomanNumeral (x - 4) +              _ | x >= 1    -> "I" ++ toRomanNumeral (x - 1) +              _             -> "" + +-- | Wrap inlines to line length. +wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc +wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=  +                          return . fsep + +wrapIfNeeded :: Monad m => WriterOptions -> ([Inline] -> m Doc) ->  +                           [Inline] -> m Doc +wrapIfNeeded opts = if writerWrapText opts +                       then wrapped  +                       else ($) + +-- +-- Parsing +-- + +-- | Like >>, but returns the operation on the left. +-- (Suggested by Tillmann Rendel on Haskell-cafe list.) +(>>~) :: (Monad m) => m a -> m b -> m a +a >>~ b = a >>= \x -> b >> return x + +-- | Parse any line of text +anyLine :: GenParser Char st [Char] +anyLine = manyTill anyChar newline + +-- | Like @manyTill@, but reads at least one item. +many1Till :: GenParser tok st a +	     -> GenParser tok st end +	     -> GenParser tok st [a] +many1Till p end = do +         first <- p +         rest <- manyTill p end +         return (first:rest) + +-- | A more general form of @notFollowedBy@.  This one allows any  +-- type of parser to be specified, and succeeds only if that parser fails. +-- It does not consume any input. +notFollowedBy' :: Show b => GenParser a st b -> GenParser a st () +notFollowedBy' p  = try $ join $  do  a <- try p +                                      return (unexpected (show a)) +                                  <|> +                                  return (return ()) +-- (This version due to Andrew Pimlott on the Haskell mailing list.) + +-- | Parses one of a list of strings (tried in order).   +oneOfStrings :: [String] -> GenParser Char st String +oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings + +-- | Parses a space or tab. +spaceChar :: CharParser st Char +spaceChar = char ' ' <|> char '\t' + +-- | Skips zero or more spaces or tabs. +skipSpaces :: GenParser Char st () +skipSpaces = skipMany spaceChar + +-- | Skips zero or more spaces or tabs, then reads a newline. +blankline :: GenParser Char st Char +blankline = try $ skipSpaces >> newline + +-- | Parses one or more blank lines and returns a string of newlines. +blanklines :: GenParser Char st [Char] +blanklines = many1 blankline + +-- | Parses material enclosed between start and end parsers. +enclosed :: GenParser Char st t   -- ^ start parser +	    -> GenParser Char st end  -- ^ end parser +	    -> GenParser Char st a    -- ^ content parser (to be used repeatedly) +	    -> GenParser Char st [a] +enclosed start end parser = try $  +  start >> notFollowedBy space >> many1Till parser end + +-- | Parse string, case insensitive. +stringAnyCase :: [Char] -> CharParser st String +stringAnyCase [] = string "" +stringAnyCase (x:xs) = do +  firstChar <- char (toUpper x) <|> char (toLower x) +  rest <- stringAnyCase xs +  return (firstChar:rest) + +-- | Parse contents of 'str' using 'parser' and return result. +parseFromString :: GenParser tok st a -> [tok] -> GenParser tok st a +parseFromString parser str = do +  oldPos <- getPosition +  oldInput <- getInput +  setInput str +  result <- parser +  setInput oldInput +  setPosition oldPos +  return result + +-- | Parse raw line block up to and including blank lines. +lineClump :: GenParser Char st String +lineClump = blanklines  +          <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) + +-- | Parse a string of characters between an open character +-- and a close character, including text between balanced +-- pairs of open and close, which must be different. For example, +-- @charsInBalanced '(' ')'@ will parse "(hello (there))" +-- and return "hello (there)".  Stop if a blank line is +-- encountered. +charsInBalanced :: Char -> Char -> GenParser Char st String +charsInBalanced open close = try $ do +  char open +  raw <- many $     (many1 (noneOf [open, close, '\n'])) +                <|> (do res <- charsInBalanced open close +                        return $ [open] ++ res ++ [close]) +                <|> try (string "\n" >>~ notFollowedBy' blanklines) +  char close +  return $ concat raw + +-- | Like @charsInBalanced@, but allow blank lines in the content. +charsInBalanced' :: Char -> Char -> GenParser Char st String +charsInBalanced' open close = try $ do +  char open +  raw <- many $       (many1 (noneOf [open, close])) +                  <|> (do res <- charsInBalanced' open close +                          return $ [open] ++ res ++ [close]) +  char close +  return $ concat raw + +-- | Parses a roman numeral (uppercase or lowercase), returns number. +romanNumeral :: Bool                  -- ^ Uppercase if true +             -> GenParser Char st Int +romanNumeral upperCase = do +    let charAnyCase c = char (if upperCase then toUpper c else c) +    let one = charAnyCase 'i' +    let five = charAnyCase 'v' +    let ten = charAnyCase 'x' +    let fifty = charAnyCase 'l' +    let hundred = charAnyCase 'c' +    let fivehundred = charAnyCase 'd' +    let thousand = charAnyCase 'm' +    thousands <- many thousand >>= (return . (1000 *) . length) +    ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 +    fivehundreds <- many fivehundred >>= (return . (500 *) . length) +    fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 +    hundreds <- many hundred >>= (return . (100 *) . length) +    nineties <- option 0 $ try $ ten >> hundred >> return 90 +    fifties <- many fifty >>= (return . (50 *) . length) +    forties <- option 0 $ try $ ten >> fifty >> return 40 +    tens <- many ten >>= (return . (10 *) . length) +    nines <- option 0 $ try $ one >> ten >> return 9 +    fives <- many five >>= (return . (5 *) . length) +    fours <- option 0 $ try $ one >> five >> return 4 +    ones <- many one >>= (return . length) +    let total = thousands + ninehundreds + fivehundreds + fourhundreds + +                hundreds + nineties + fifties + forties + tens + nines + +                fives + fours + ones +    if total == 0 +       then fail "not a roman numeral" +       else return total + +-- | Applies a parser, returns tuple of its results and its horizontal +-- displacement (the difference between the source column at the end +-- and the source column at the beginning). Vertical displacement +-- (source row) is ignored. +withHorizDisplacement :: GenParser Char st a  -- ^ Parser to apply +                      -> GenParser Char st (a, Int) -- ^ (result, displacement) +withHorizDisplacement parser = do +  pos1 <- getPosition +  result <- parser +  pos2 <- getPosition +  return (result, sourceColumn pos2 - sourceColumn pos1) + +-- | Parses a character and returns 'Null' (so that the parser can move on +-- if it gets stuck). +nullBlock :: GenParser Char st Block +nullBlock = anyChar >> return Null + +-- | Fail if reader is in strict markdown syntax mode. +failIfStrict :: GenParser Char ParserState () +failIfStrict = do +  state <- getState +  if stateStrict state then fail "strict mode" else return () + +-- | Parses backslash, then applies character parser. +escaped :: GenParser Char st Char  -- ^ Parser for character to escape +        -> GenParser Char st Inline +escaped parser = try $ do +  char '\\' +  result <- parser +  return (Str [result]) + +-- | Parses an uppercase roman numeral and returns (UpperRoman, number). +upperRoman :: GenParser Char st (ListNumberStyle, Int) +upperRoman = do +  num <- romanNumeral True +  return (UpperRoman, num) + +-- | Parses a lowercase roman numeral and returns (LowerRoman, number). +lowerRoman :: GenParser Char st (ListNumberStyle, Int) +lowerRoman = do +  num <- romanNumeral False +  return (LowerRoman, num) + +-- | Parses a decimal numeral and returns (Decimal, number). +decimal :: GenParser Char st (ListNumberStyle, Int) +decimal = do +  num <- many1 digit +  return (Decimal, read num) + +-- | Parses a '#' returns (DefaultStyle, 1). +defaultNum :: GenParser Char st (ListNumberStyle, Int) +defaultNum = do +  char '#' +  return (DefaultStyle, 1) + +-- | Parses a lowercase letter and returns (LowerAlpha, number). +lowerAlpha :: GenParser Char st (ListNumberStyle, Int) +lowerAlpha = do +  ch <- oneOf ['a'..'z'] +  return (LowerAlpha, ord ch - ord 'a' + 1) + +-- | Parses an uppercase letter and returns (UpperAlpha, number). +upperAlpha :: GenParser Char st (ListNumberStyle, Int) +upperAlpha = do +  ch <- oneOf ['A'..'Z'] +  return (UpperAlpha, ord ch - ord 'A' + 1) + +-- | Parses a roman numeral i or I +romanOne :: GenParser Char st (ListNumberStyle, Int) +romanOne = (char 'i' >> return (LowerRoman, 1)) <|> +           (char 'I' >> return (UpperRoman, 1)) + +-- | Parses an ordered list marker and returns list attributes. +anyOrderedListMarker :: GenParser Char st ListAttributes  +anyOrderedListMarker = choice $  +  [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], +                           numParser <- [decimal, defaultNum, romanOne, +                           lowerAlpha, lowerRoman, upperAlpha, upperRoman]] + +-- | Parses a list number (num) followed by a period, returns list attributes. +inPeriod :: GenParser Char st (ListNumberStyle, Int) +         -> GenParser Char st ListAttributes  +inPeriod num = try $ do +  (style, start) <- num +  char '.' +  let delim = if style == DefaultStyle +                 then DefaultDelim +                 else Period +  return (start, style, delim) +  +-- | Parses a list number (num) followed by a paren, returns list attributes. +inOneParen :: GenParser Char st (ListNumberStyle, Int) +           -> GenParser Char st ListAttributes  +inOneParen num = try $ do +  (style, start) <- num +  char ')' +  return (start, style, OneParen) + +-- | Parses a list number (num) enclosed in parens, returns list attributes. +inTwoParens :: GenParser Char st (ListNumberStyle, Int) +            -> GenParser Char st ListAttributes  +inTwoParens num = try $ do +  char '(' +  (style, start) <- num +  char ')' +  return (start, style, TwoParens) + +-- | Parses an ordered list marker with a given style and delimiter, +-- returns number. +orderedListMarker :: ListNumberStyle  +                  -> ListNumberDelim  +                  -> GenParser Char st Int +orderedListMarker style delim = do +  let num = case style of +               DefaultStyle -> decimal <|> defaultNum +               Decimal      -> decimal +               UpperRoman   -> upperRoman +               LowerRoman   -> lowerRoman +               UpperAlpha   -> upperAlpha +               LowerAlpha   -> lowerAlpha +  let context = case delim of +               DefaultDelim -> inPeriod +               Period       -> inPeriod +               OneParen     -> inOneParen +               TwoParens    -> inTwoParens +  (start, _, _) <- context num +  return start + +-- | Parses a character reference and returns a Str element. +charRef :: GenParser Char st Inline +charRef = do +  c <- characterReference +  return $ Str [c] + +-- | Parse a string with a given parser and state. +readWith :: GenParser Char ParserState a      -- ^ parser +         -> ParserState                    -- ^ initial state +         -> String                         -- ^ input string +         -> a +readWith parser state input =  +    case runParser parser state "source" input of +      Left err     -> error $ "\nError:\n" ++ show err +      Right result -> result + +-- | Parse a string with @parser@ (for testing). +testStringWith :: (Show a) => GenParser Char ParserState a +               -> String +               -> IO () +testStringWith parser str = putStrLn $ show $  +                            readWith parser defaultParserState str + +-- | Parsing options. +data ParserState = ParserState +    { stateParseRaw        :: Bool,          -- ^ Parse raw HTML and LaTeX? +      stateParserContext   :: ParserContext, -- ^ Inside list? +      stateQuoteContext    :: QuoteContext,  -- ^ Inside quoted environment? +      stateKeys            :: KeyTable,      -- ^ List of reference keys +      stateNotes           :: NoteTable,     -- ^ List of notes +      stateTabStop         :: Int,           -- ^ Tab stop +      stateStandalone      :: Bool,          -- ^ Parse bibliographic info? +      stateTitle           :: [Inline],      -- ^ Title of document +      stateAuthors         :: [String],      -- ^ Authors of document +      stateDate            :: String,        -- ^ Date of document +      stateStrict          :: Bool,          -- ^ Use strict markdown syntax? +      stateSmart           :: Bool,          -- ^ Use smart typography? +      stateColumns         :: Int,           -- ^ Number of columns in terminal +      stateHeaderTable     :: [HeaderType]   -- ^ Ordered list of header types used +    } +    deriving Show + +defaultParserState :: ParserState +defaultParserState =  +    ParserState { stateParseRaw        = False, +                  stateParserContext   = NullState, +                  stateQuoteContext    = NoQuote, +                  stateKeys            = [], +                  stateNotes           = [], +                  stateTabStop         = 4, +                  stateStandalone      = False, +                  stateTitle           = [], +                  stateAuthors         = [], +                  stateDate            = [], +                  stateStrict          = False, +                  stateSmart           = False, +                  stateColumns         = 80, +                  stateHeaderTable     = [] } + +data HeaderType  +    = SingleHeader Char  -- ^ Single line of characters underneath +    | DoubleHeader Char  -- ^ Lines of characters above and below +    deriving (Eq, Show) + +data ParserContext  +    = ListItemState   -- ^ Used when running parser on list item contents +    | NullState       -- ^ Default state +    deriving (Eq, Show) + +data QuoteContext +    = InSingleQuote   -- ^ Used when parsing inside single quotes +    | InDoubleQuote   -- ^ Used when parsing inside double quotes +    | NoQuote         -- ^ Used when not parsing inside quotes +    deriving (Eq, Show) + +type NoteTable = [(String, [Block])] + +type KeyTable = [([Inline], Target)] + +-- | Look up key in key table and return target object. +lookupKeySrc :: KeyTable  -- ^ Key table +             -> [Inline]  -- ^ Key +             -> Maybe Target +lookupKeySrc table key = case find (refsMatch key . fst) table of +                           Nothing       -> Nothing +                           Just (_, src) -> Just src + +-- | Returns @True@ if keys match (case insensitive). +refsMatch :: [Inline] -> [Inline] -> Bool +refsMatch ((Str x):restx) ((Str y):resty) =  +    ((map toLower x) == (map toLower y)) && refsMatch restx resty +refsMatch ((Emph x):restx) ((Emph y):resty) =  +    refsMatch x y && refsMatch restx resty +refsMatch ((Strong x):restx) ((Strong y):resty) =  +    refsMatch x y && refsMatch restx resty +refsMatch ((Strikeout x):restx) ((Strikeout y):resty) =  +    refsMatch x y && refsMatch restx resty +refsMatch ((Superscript x):restx) ((Superscript y):resty) =  +    refsMatch x y && refsMatch restx resty +refsMatch ((Subscript x):restx) ((Subscript y):resty) =  +    refsMatch x y && refsMatch restx resty +refsMatch ((Quoted t x):restx) ((Quoted u y):resty) =  +    t == u && refsMatch x y && refsMatch restx resty +refsMatch ((Code x):restx) ((Code y):resty) =  +    ((map toLower x) == (map toLower y)) && refsMatch restx resty +refsMatch ((TeX x):restx) ((TeX y):resty) =  +    ((map toLower x) == (map toLower y)) && refsMatch restx resty +refsMatch ((HtmlInline x):restx) ((HtmlInline y):resty) =  +    ((map toLower x) == (map toLower y)) && refsMatch restx resty +refsMatch (x:restx) (y:resty) = (x == y) && refsMatch restx resty +refsMatch [] x = null x +refsMatch x [] = null x + +-- +-- Native format prettyprinting +-- +  +-- | Indent string as a block. +indentBy :: Int    -- ^ Number of spaces to indent the block  +         -> Int    -- ^ Number of spaces (rel to block) to indent first line +         -> String -- ^ Contents of block to indent +         -> String +indentBy _ _ [] = "" +indentBy num first str =  +  let (firstLine:restLines) = lines str  +      firstLineIndent = num + first +  in  (replicate firstLineIndent ' ') ++ firstLine ++ "\n" ++  +      (joinWithSep "\n" $ map ((replicate num ' ') ++ ) restLines) + +-- | Prettyprint list of Pandoc blocks elements. +prettyBlockList :: Int       -- ^ Number of spaces to indent list of blocks +                -> [Block]   -- ^ List of blocks +                -> String +prettyBlockList indent [] = indentBy indent 0 "[]" +prettyBlockList indent blocks = indentBy indent (-2) $ "[ " ++  +  (joinWithSep "\n, " (map prettyBlock blocks)) ++ " ]" + +-- | Prettyprint Pandoc block element. +prettyBlock :: Block -> String +prettyBlock (BlockQuote blocks) = "BlockQuote\n  " ++  +                                  (prettyBlockList 2 blocks)  +prettyBlock (OrderedList attribs blockLists) =  +  "OrderedList " ++ show attribs ++ "\n" ++ indentBy 2 0 ("[ " ++  +  (joinWithSep ", " $ map (\blocks -> prettyBlockList 2 blocks)  +  blockLists)) ++ " ]" +prettyBlock (BulletList blockLists) = "BulletList\n" ++  +  indentBy 2 0 ("[ " ++ (joinWithSep ", "  +  (map (\blocks -> prettyBlockList 2 blocks) blockLists))) ++ " ]"  +prettyBlock (DefinitionList blockLists) = "DefinitionList\n" ++  +  indentBy 2 0 ("[" ++ (joinWithSep ",\n"  +  (map (\(term, blocks) -> "  (" ++ show term ++ ",\n" ++  +  indentBy 1 2 (prettyBlockList 2 blocks) ++ "  )") blockLists))) ++ " ]"  +prettyBlock (Table caption aligns widths header rows) =  +  "Table " ++ show caption ++ " " ++ show aligns ++ " " ++  +  show widths ++ "\n" ++ prettyRow header ++ " [\n" ++   +  (joinWithSep ",\n" (map prettyRow rows)) ++ " ]" +  where prettyRow cols = indentBy 2 0 ("[ " ++ (joinWithSep ", " +                         (map (\blocks -> prettyBlockList 2 blocks)  +                         cols))) ++ " ]" +prettyBlock block = show block + +-- | Prettyprint Pandoc document. +prettyPandoc :: Pandoc -> String +prettyPandoc (Pandoc meta blocks) = "Pandoc " ++ "(" ++ show meta ++  +  ")\n" ++ (prettyBlockList 0 blocks) ++ "\n" + +-- +-- Pandoc block and inline list processing +-- + +-- | Generate infinite lazy list of markers for an ordered list, +-- depending on list attributes. +orderedListMarkers :: (Int, ListNumberStyle, ListNumberDelim) -> [String] +orderedListMarkers (start, numstyle, numdelim) =  +  let singleton c = [c] +      nums = case numstyle of +                     DefaultStyle -> map show [start..] +                     Decimal      -> map show [start..] +                     UpperAlpha   -> drop (start - 1) $ cycle $  +                                     map singleton ['A'..'Z'] +                     LowerAlpha   -> drop (start - 1) $ cycle $ +                                     map singleton ['a'..'z'] +                     UpperRoman   -> map toRomanNumeral [start..] +                     LowerRoman   -> map (map toLower . toRomanNumeral) [start..] +      inDelim str = case numdelim of +                            DefaultDelim -> str ++ "." +                            Period       -> str ++ "." +                            OneParen     -> str ++ ")" +                            TwoParens    -> "(" ++ str ++ ")" +  in  map inDelim nums + +-- | Normalize a list of inline elements: remove leading and trailing +-- @Space@ elements, collapse double @Space@s into singles, and +-- remove empty Str elements. +normalizeSpaces :: [Inline] -> [Inline] +normalizeSpaces [] = [] +normalizeSpaces list =  +    let removeDoubles [] = [] +        removeDoubles (Space:Space:rest) = removeDoubles (Space:rest) +        removeDoubles (Space:(Str ""):Space:rest) = removeDoubles (Space:rest) +        removeDoubles ((Str ""):rest) = removeDoubles rest  +        removeDoubles (x:rest) = x:(removeDoubles rest) +        removeLeading (Space:xs) = removeLeading xs +        removeLeading x = x +        removeTrailing [] = [] +        removeTrailing lst = if (last lst == Space) +                                then init lst +                                else lst +    in  removeLeading $ removeTrailing $ removeDoubles list + +-- | Change final list item from @Para@ to @Plain@ if the list should  +-- be compact. +compactify :: [[Block]]  -- ^ List of list items (each a list of blocks) +           -> [[Block]] +compactify [] = [] +compactify items = +    let final  = last items +        others = init items +    in  case final of +          [Para a]  -> if any containsPara others +                          then items +                          else others ++ [[Plain a]] +          _         -> items + +containsPara :: [Block] -> Bool +containsPara [] = False +containsPara ((Para _):_) = True +containsPara ((BulletList items):rest) =  any containsPara items || +                                          containsPara rest +containsPara ((OrderedList _ items):rest) = any containsPara items || +                                            containsPara rest +containsPara ((DefinitionList items):rest) = any containsPara (map snd items) || +                                             containsPara rest +containsPara (_:rest) = containsPara rest + +-- | Data structure for defining hierarchical Pandoc documents +data Element = Blk Block  +             | Sec [Inline] [Element] deriving (Eq, Read, Show) + +-- | Returns @True@ on Header block with at least the specified level +headerAtLeast :: Int -> Block -> Bool +headerAtLeast level (Header x _) = x <= level +headerAtLeast _ _ = False + +-- | Convert list of Pandoc blocks into (hierarchical) list of Elements +hierarchicalize :: [Block] -> [Element] +hierarchicalize [] = [] +hierarchicalize (block:rest) =  +  case block of +    (Header level title) ->  +         let (thisSection, rest') = break (headerAtLeast level) rest +         in  (Sec title (hierarchicalize thisSection)):(hierarchicalize rest')  +    x -> (Blk x):(hierarchicalize rest) + +-- | True if block is a Header block. +isHeaderBlock :: Block -> Bool +isHeaderBlock (Header _ _) = True +isHeaderBlock _ = False + +-- +-- Writer options +-- + +-- | Options for writers +data WriterOptions = WriterOptions +  { writerStandalone      :: Bool   -- ^ Include header and footer +  , writerHeader          :: String -- ^ Header for the document +  , writerTitlePrefix     :: String -- ^ Prefix for HTML titles +  , writerTabStop         :: Int    -- ^ Tabstop for conversion btw spaces and tabs +  , writerTableOfContents :: Bool   -- ^ Include table of contents +  , writerS5              :: Bool   -- ^ We're writing S5  +  , writerUseASCIIMathML  :: Bool   -- ^ Use ASCIIMathML +  , writerASCIIMathMLURL  :: Maybe String -- ^ URL to asciiMathML.js  +  , writerIgnoreNotes     :: Bool   -- ^ Ignore footnotes (used in making toc) +  , writerIncremental     :: Bool   -- ^ Incremental S5 lists +  , writerNumberSections  :: Bool   -- ^ Number sections in LaTeX +  , writerIncludeBefore   :: String -- ^ String to include before the body +  , writerIncludeAfter    :: String -- ^ String to include after the body +  , writerStrictMarkdown  :: Bool   -- ^ Use strict markdown syntax +  , writerReferenceLinks  :: Bool   -- ^ Use reference links in writing markdown, rst +  , writerWrapText        :: Bool   -- ^ Wrap text to line length +  } deriving Show + +-- | Default writer options. +defaultWriterOptions :: WriterOptions +defaultWriterOptions =  +  WriterOptions { writerStandalone      = False +                , writerHeader          = "" +                , writerTitlePrefix     = "" +                , writerTabStop         = 4 +                , writerTableOfContents = False +                , writerS5              = False +                , writerUseASCIIMathML  = False +                , writerASCIIMathMLURL  = Nothing +                , writerIgnoreNotes     = False +                , writerIncremental     = False +                , writerNumberSections  = False +                , writerIncludeBefore   = "" +                , writerIncludeAfter    = "" +                , writerStrictMarkdown  = False +                , writerReferenceLinks  = False +                , writerWrapText        = True +                } diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs new file mode 100644 index 000000000..16bdb9218 --- /dev/null +++ b/src/Text/Pandoc/UTF8.hs @@ -0,0 +1,45 @@ +-- | Functions for converting Unicode strings to UTF-8 and vice versa. +--  +-- Taken from <http://www.cse.ogi.edu/~hallgren/Talks/LHiH/base/lib/UTF8.hs>. +-- (c) 2003, OGI School of Science & Engineering, Oregon Health and +-- Science University.   +-- +-- Modified by Martin Norbaeck +-- to pass illegal UTF-8 sequences through unchanged. +module Text.Pandoc.UTF8 (  +             fromUTF8,  +             toUTF8  +            ) where + +-- From the Char module supplied with HBC. + +-- | Take a UTF-8 string and decode it into a Unicode string. +fromUTF8 :: String -> String +fromUTF8 "" = "" +fromUTF8 ('\xef':'\xbb':'\xbf':cs) = fromUTF8 cs -- skip BOM (byte order marker) +fromUTF8 (c:c':cs) | '\xc0' <= c  && c  <= '\xdf' &&  +		             '\x80' <= c' && c' <= '\xbf' = +	toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : fromUTF8 cs +fromUTF8 (c:c':c'':cs) | '\xe0' <= c   && c   <= '\xef' &&  +		          '\x80' <= c'  && c'  <= '\xbf' && +		          '\x80' <= c'' && c'' <= '\xbf' = +	toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : fromUTF8 cs +fromUTF8 (c:cs) = c : fromUTF8 cs + +-- | Take a Unicode string and encode it as a UTF-8 string. +toUTF8 :: String -> String +toUTF8 "" = "" +toUTF8 (c:cs) = +	if c > '\x0000' && c < '\x0080' then +	    c : toUTF8 cs +	else if c < toEnum 0x0800 then +	    let i = fromEnum c +	    in  toEnum (0xc0 + i `div` 0x40) :  +	        toEnum (0x80 + i `mod` 0x40) :  +		toUTF8 cs +	else +	    let i = fromEnum c +	    in  toEnum (0xe0 + i `div` 0x1000) :  +	        toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :  +		toEnum (0x80 + i `mod` 0x40) :  +		toUTF8 cs diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs new file mode 100644 index 000000000..13912a9f3 --- /dev/null +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -0,0 +1,248 @@ +{- +Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.ConTeXt +   Copyright   : Copyright (C) 2007 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha  +   Portability : portable + +Conversion of 'Pandoc' format into ConTeXt. +-} +module Text.Pandoc.Writers.ConTeXt ( writeConTeXt ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Printf ( printf ) +import Data.List ( (\\), intersperse ) +import Control.Monad.State + +type WriterState = Int -- number of next URL reference  + +-- | Convert Pandoc to ConTeXt. +writeConTeXt :: WriterOptions -> Pandoc -> String +writeConTeXt options document = evalState (pandocToConTeXt options document) 1  + +pandocToConTeXt :: WriterOptions -> Pandoc -> State WriterState String +pandocToConTeXt options (Pandoc meta blocks) = do +  main    <- blockListToConTeXt blocks  +  let body = writerIncludeBefore options ++ main ++ writerIncludeAfter options +  head    <- if writerStandalone options +                then contextHeader options meta +                else return "" +  let toc  = if writerTableOfContents options +                then "\\placecontent\n\n" +                else ""  +  let foot = if writerStandalone options +                then "\n\\stoptext\n" +                else "" +  return $ head ++ toc ++ body ++ foot + +-- | Insert bibliographic information into ConTeXt header. +contextHeader :: WriterOptions -- ^ Options, including ConTeXt header +              -> Meta          -- ^ Meta with bibliographic information +              -> State WriterState String +contextHeader options (Meta title authors date) = do +  titletext    <- if null title +                     then return ""  +                     else inlineListToConTeXt title +  let authorstext = if null authors +                       then "" +                       else if length authors == 1 +                            then stringToConTeXt $ head authors +                            else stringToConTeXt $ (joinWithSep ", " $ +                                 init authors) ++ " & " ++ last authors +  let datetext   = if date == "" +                       then ""  +                       else stringToConTeXt date +  let titleblock = "\\doctitle{" ++ titletext ++ "}\n\ +                   \ \\author{" ++ authorstext ++ "}\n\ +                   \ \\date{" ++ datetext ++ "}\n\n" +  let setupheads = if (writerNumberSections options) +                      then "\\setupheads[sectionnumber=yes, style=\\bf]\n"  +                      else "\\setupheads[sectionnumber=no, style=\\bf]\n" +  let header     = writerHeader options +  return $ header ++ setupheads ++ titleblock ++ "\\starttext\n\\maketitle\n\n" + +-- escape things as needed for ConTeXt + +escapeCharForConTeXt :: Char -> String +escapeCharForConTeXt ch = + case ch of +    '{'  -> "\\letteropenbrace{}" +    '}'  -> "\\letterclosebrace{}" +    '\\' -> "\\letterbackslash{}" +    '$'  -> "\\$" +    '|'  -> "\\letterbar{}" +    '^'  -> "\\letterhat{}" +    '%'  -> "\\%" +    '~'  -> "\\lettertilde{}" +    '&'  -> "\\&" +    '#'  -> "\\#" +    '<'  -> "\\letterless{}" +    '>'  -> "\\lettermore{}" +    '_'  -> "\\letterunderscore{}" +    x    -> [x] + +-- | Escape string for ConTeXt +stringToConTeXt :: String -> String +stringToConTeXt = concatMap escapeCharForConTeXt + +-- | Convert Pandoc block element to ConTeXt. +blockToConTeXt :: Block -> State WriterState String  +blockToConTeXt Null = return "" +blockToConTeXt (Plain lst) = inlineListToConTeXt lst >>= return . (++ "\n") +blockToConTeXt (Para lst) = inlineListToConTeXt lst >>= return . (++ "\n\n") +blockToConTeXt (BlockQuote lst) = do +  contents <- blockListToConTeXt lst +  return $ "\\startblockquote\n" ++ contents ++ "\\stopblockquote\n\n" +blockToConTeXt (CodeBlock str) =  +  return $ "\\starttyping\n" ++ str ++ "\n\\stoptyping\n" +blockToConTeXt (RawHtml str) = return "" +blockToConTeXt (BulletList lst) = do  +  contents <- mapM listItemToConTeXt lst +  return $ "\\startltxitem\n" ++ concat contents ++ "\\stopltxitem\n" +blockToConTeXt (OrderedList attribs lst) = case attribs of +  (1, DefaultStyle, DefaultDelim) -> do +    contents <- mapM listItemToConTeXt lst +    return $  "\\startltxenum\n" ++ concat contents ++ "\\stopltxenum\n" +  _ -> do +    let markers = take (length lst) $ orderedListMarkers attribs +    contents <- zipWithM orderedListItemToConTeXt markers lst +    let markerWidth = maximum $ map length markers  +    let markerWidth' = if markerWidth < 3 +                          then "" +                          else "[width=" ++  +                               show ((markerWidth + 2) `div` 2)  ++ "em]" +    return $ "\\startitemize" ++ markerWidth' ++ "\n" ++ concat contents ++  +             "\\stopitemize\n" +blockToConTeXt (DefinitionList lst) = +  mapM defListItemToConTeXt lst >>= return . (++ "\n") . concat +blockToConTeXt HorizontalRule = return "\\thinrule\n\n" +blockToConTeXt (Header level lst) = do +  contents <- inlineListToConTeXt lst +  return $ if level > 0 && level <= 3 +              then "\\" ++ concat (replicate (level - 1) "sub") ++  +                   "section{" ++ contents ++ "}\n\n" +              else contents ++ "\n\n" +blockToConTeXt (Table caption aligns widths heads rows) = do +    let colWidths = map printDecimal widths +    let colDescriptor colWidth alignment = (case alignment of +                                               AlignLeft    -> 'l'  +                                               AlignRight   -> 'r' +                                               AlignCenter  -> 'c' +                                               AlignDefault -> 'l'): +                                           "p(" ++ colWidth ++ "\\textwidth)|" +    let colDescriptors = "|" ++ (concat $  +                                 zipWith colDescriptor colWidths aligns) +    headers <- tableRowToConTeXt heads  +    captionText <- inlineListToConTeXt caption  +    let captionText' = if null caption then "none" else captionText +    rows' <- mapM tableRowToConTeXt rows  +    return $ "\\placetable[here]{" ++ captionText' ++ "}\n\\starttable[" ++  +             colDescriptors ++ "]\n" ++ "\\HL\n" ++ headers ++ "\\HL\n" ++  +             concat rows' ++ "\\HL\n\\stoptable\n\n"  + +printDecimal :: Float -> String +printDecimal = printf "%.2f"  + +tableRowToConTeXt cols = do +  cols' <- mapM blockListToConTeXt cols +  return $ "\\NC " ++ (concat $ intersperse "\\NC " cols') ++ "\\NC\\AR\n" + +listItemToConTeXt list = do +  contents <- blockListToConTeXt list +  return $ "\\item " ++ contents + +orderedListItemToConTeXt marker list = do +  contents <- blockListToConTeXt list +  return $ "\\sym{" ++ marker ++ "} " ++ contents + +defListItemToConTeXt (term, def) = do +  term' <- inlineListToConTeXt term +  def'  <- blockListToConTeXt def +  return $ "\\startdescr{" ++ term' ++ "}\n" ++ +           def' ++ "\n\\stopdescr\n" + +-- | Convert list of block elements to ConTeXt. +blockListToConTeXt :: [Block] -> State WriterState String +blockListToConTeXt lst = mapM blockToConTeXt lst >>= return . concat + +-- | Convert list of inline elements to ConTeXt. +inlineListToConTeXt :: [Inline]  -- ^ Inlines to convert +                    -> State WriterState String +inlineListToConTeXt lst = mapM inlineToConTeXt lst >>= return . concat + +isQuoted :: Inline -> Bool +isQuoted (Quoted _ _) = True +isQuoted Apostrophe = True +isQuoted _ = False + +-- | Convert inline element to ConTeXt +inlineToConTeXt :: Inline    -- ^ Inline to convert +                -> State WriterState String +inlineToConTeXt (Emph lst) = do +  contents <- inlineListToConTeXt lst +  return $ "{\\em " ++ contents ++ "}"  +inlineToConTeXt (Strong lst) = do +  contents <- inlineListToConTeXt lst +  return $ "{\\bf " ++ contents ++ "}"  +inlineToConTeXt (Strikeout lst) = do +  contents <- inlineListToConTeXt lst +  return $ "\\overstrikes{" ++ contents ++ "}"  +inlineToConTeXt (Superscript lst) = do +  contents <- inlineListToConTeXt lst +  return $ "\\high{" ++ contents ++ "}"  +inlineToConTeXt (Subscript lst) = do +  contents <- inlineListToConTeXt lst +  return $ "\\low{" ++ contents ++ "}"  +inlineToConTeXt (Code str) = return $ "\\type{" ++ str ++ "}" +inlineToConTeXt (Quoted SingleQuote lst) = do +  contents <- inlineListToConTeXt lst +  return $ "\\quote{" ++ contents ++ "}" +inlineToConTeXt (Quoted DoubleQuote lst) = do +  contents <- inlineListToConTeXt lst +  return $ "\\quotation{" ++ contents ++ "}" +inlineToConTeXt Apostrophe = return "'" +inlineToConTeXt EmDash = return "---" +inlineToConTeXt EnDash = return "--" +inlineToConTeXt Ellipses = return "\\ldots{}" +inlineToConTeXt (Str str) = return $ stringToConTeXt str +inlineToConTeXt (TeX str) = return str +inlineToConTeXt (HtmlInline str) = return "" +inlineToConTeXt (LineBreak) = return "\\crlf\n" +inlineToConTeXt Space = return " " +inlineToConTeXt (Link [Code str] (src, tit)) = -- since ConTeXt has its own  +  inlineToConTeXt (Link [Str str] (src, tit))  -- way of printing links...  +inlineToConTeXt (Link text (src, _)) = do +  next <- get +  put (next + 1) +  let ref = show next +  label <- inlineListToConTeXt text +  return $ "\\useurl[" ++ ref ++ "][" ++ src ++ "][][" ++ label ++  +           "]\\from[" ++ ref ++ "]"  +inlineToConTeXt (Image alternate (src, tit)) = do +  alt <- inlineListToConTeXt alternate +  return $ "\\placefigure\n[]\n[fig:" ++ alt ++ "]\n{" ++ +           tit ++ "}\n{\\externalfigure[" ++ src ++ "]}"  +inlineToConTeXt (Note contents) = do +  contents' <- blockListToConTeXt contents +  return $ "\\footnote{" ++ contents' ++ "}" + diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs new file mode 100644 index 000000000..13dc8585d --- /dev/null +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -0,0 +1,299 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.Docbook +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of 'Pandoc' documents to Docbook XML. +-} +module Text.Pandoc.Writers.Docbook ( writeDocbook) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Data.List ( isPrefixOf, drop ) +import Text.PrettyPrint.HughesPJ hiding ( Str ) + +-- +-- code to format XML +-- + +-- | Escape one character as needed for XML. +escapeCharForXML :: Char -> String +escapeCharForXML x = case x of +                       '&'  -> "&" +                       '<'  -> "<" +                       '>'  -> ">" +                       '"'  -> """ +                       '\160' -> " " +                       c    -> [c]  + +-- | True if the character needs to be escaped. +needsEscaping :: Char -> Bool +needsEscaping c = c `elem` "&<>\"\160" + +-- | Escape string as needed for XML.  Entity references are not preserved. +escapeStringForXML :: String -> String +escapeStringForXML ""  = "" +escapeStringForXML str =  +  case break needsEscaping str of +    (okay, "")     -> okay +    (okay, (c:cs)) -> okay ++ escapeCharForXML c ++ escapeStringForXML cs  + +-- | Return a text object with a string of formatted XML attributes.  +attributeList :: [(String, String)] -> Doc +attributeList = text .  concatMap  +  (\(a, b) -> " " ++ escapeStringForXML a ++ "=\"" ++  +  escapeStringForXML b ++ "\"")  + +-- | Put the supplied contents between start and end tags of tagType, +--   with specified attributes and (if specified) indentation. +inTags:: Bool -> String -> [(String, String)] -> Doc -> Doc +inTags isIndented tagType attribs contents =  +  let openTag = char '<' <> text tagType <> attributeList attribs <>  +                char '>' +      closeTag  = text "</" <> text tagType <> char '>' +  in  if isIndented +         then openTag $$ nest 2 contents $$ closeTag +         else openTag <> contents <> closeTag + +-- | Return a self-closing tag of tagType with specified attributes +selfClosingTag :: String -> [(String, String)] -> Doc +selfClosingTag tagType attribs =  +  char '<' <> text tagType <> attributeList attribs <> text " />"  +  +-- | Put the supplied contents between start and end tags of tagType. +inTagsSimple :: String -> Doc -> Doc +inTagsSimple tagType = inTags False tagType [] + +-- | Put the supplied contents in indented block btw start and end tags. +inTagsIndented :: String -> Doc -> Doc +inTagsIndented tagType = inTags True tagType [] + +-- +-- Docbook writer +-- + +-- | Convert list of authors to a docbook <author> section +authorToDocbook :: [Char] -> Doc +authorToDocbook name = inTagsIndented "author" $  +  if ',' `elem` name +    then -- last name first +         let (lastname, rest) = break (==',') name  +             firstname = removeLeadingSpace rest in +         inTagsSimple "firstname" (text $ escapeStringForXML firstname) <>  +         inTagsSimple "surname" (text $ escapeStringForXML lastname)  +    else -- last name last +         let namewords = words name +             lengthname = length namewords  +             (firstname, lastname) = case lengthname of +               0  -> ("","")  +               1  -> ("", name) +               n  -> (joinWithSep " " (take (n-1) namewords), last namewords) +          in inTagsSimple "firstname" (text $ escapeStringForXML firstname) $$  +             inTagsSimple "surname" (text $ escapeStringForXML lastname)  + +-- | Convert Pandoc document to string in Docbook format. +writeDocbook :: WriterOptions -> Pandoc -> String +writeDocbook opts (Pandoc (Meta title authors date) blocks) =  +  let head     = if writerStandalone opts +                    then text (writerHeader opts) +                    else empty +      meta     = if writerStandalone opts +                    then inTagsIndented "articleinfo" $ +                         (inTagsSimple "title" (wrap opts title)) $$  +                         (vcat (map authorToDocbook authors)) $$  +                         (inTagsSimple "date" (text $ escapeStringForXML date))  +                    else empty +      elements = hierarchicalize blocks +      before   = writerIncludeBefore opts +      after    = writerIncludeAfter opts +      body     = (if null before then empty else text before) $$ +                 vcat (map (elementToDocbook opts) elements) $$ +                 (if null after then empty else text after) +      body'    = if writerStandalone opts +                   then inTagsIndented "article" (meta $$ body) +                   else body  +  in  render $ head $$ body' $$ text "" + +-- | Convert an Element to Docbook. +elementToDocbook :: WriterOptions -> Element -> Doc +elementToDocbook opts (Blk block) = blockToDocbook opts block  +elementToDocbook opts (Sec title elements) = +  -- Docbook doesn't allow sections with no content, so insert some if needed +  let elements' = if null elements +                    then [Blk (Para [])] +                    else elements +  in  inTagsIndented "section" $ +      inTagsSimple "title" (wrap opts title) $$ +      vcat (map (elementToDocbook opts) elements')  + +-- | Convert a list of Pandoc blocks to Docbook. +blocksToDocbook :: WriterOptions -> [Block] -> Doc +blocksToDocbook opts = vcat . map (blockToDocbook opts) + +-- | Auxiliary function to convert Plain block to Para. +plainToPara (Plain x) = Para x +plainToPara x = x + +-- | Convert a list of pairs of terms and definitions into a list of  +-- Docbook varlistentrys. +deflistItemsToDocbook :: WriterOptions -> [([Inline],[Block])] -> Doc +deflistItemsToDocbook opts items =  +  vcat $ map (\(term, def) -> deflistItemToDocbook opts term def) items + +-- | Convert a term and a list of blocks into a Docbook varlistentry. +deflistItemToDocbook :: WriterOptions -> [Inline] -> [Block] -> Doc +deflistItemToDocbook opts term def = +  let def' = map plainToPara def +  in  inTagsIndented "varlistentry" $ +      inTagsIndented "term" (inlinesToDocbook opts term) $$ +      inTagsIndented "listitem" (blocksToDocbook opts def') + +-- | Convert a list of lists of blocks to a list of Docbook list items. +listItemsToDocbook :: WriterOptions -> [[Block]] -> Doc +listItemsToDocbook opts items = vcat $ map (listItemToDocbook opts) items + +-- | Convert a list of blocks into a Docbook list item. +listItemToDocbook :: WriterOptions -> [Block] -> Doc +listItemToDocbook opts item = +  inTagsIndented "listitem" $ blocksToDocbook opts $ map plainToPara item + +-- | Convert a Pandoc block element to Docbook. +blockToDocbook :: WriterOptions -> Block -> Doc +blockToDocbook opts Null = empty +blockToDocbook opts (Plain lst) = wrap opts lst +blockToDocbook opts (Para lst) = inTagsIndented "para" $ wrap opts lst +blockToDocbook opts (BlockQuote blocks) = +  inTagsIndented "blockquote" $ blocksToDocbook opts blocks +blockToDocbook opts (CodeBlock str) =  +  text "<screen>\n" <> text (escapeStringForXML str) <> text "\n</screen>" +blockToDocbook opts (BulletList lst) =  +  inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst  +blockToDocbook opts (OrderedList _ []) = empty  +blockToDocbook opts (OrderedList (start, numstyle, numdelim) (first:rest)) = +  let attribs  = case numstyle of +                       DefaultStyle -> [] +                       Decimal      -> [("numeration", "arabic")] +                       UpperAlpha   -> [("numeration", "upperalpha")] +                       LowerAlpha   -> [("numeration", "loweralpha")] +                       UpperRoman   -> [("numeration", "upperroman")] +                       LowerRoman   -> [("numeration", "lowerroman")] +      items    = if start == 1 +                    then listItemsToDocbook opts (first:rest) +                    else (inTags True "listitem" [("override",show start)] +                         (blocksToDocbook opts $ map plainToPara first)) $$  +                         listItemsToDocbook opts rest  +  in  inTags True "orderedlist" attribs items +blockToDocbook opts (DefinitionList lst) =  +  inTagsIndented "variablelist" $ deflistItemsToDocbook opts lst  +blockToDocbook opts (RawHtml str) = text str -- raw XML block  +blockToDocbook opts HorizontalRule = empty -- not semantic +blockToDocbook opts (Table caption aligns widths headers rows) = +  let alignStrings = map alignmentToString aligns +      captionDoc   = if null caption +                      then empty +                      else inTagsIndented "caption"  +                           (inlinesToDocbook opts caption) +      tableType    = if isEmpty captionDoc then "informaltable" else "table" +  in  inTagsIndented tableType $ captionDoc $$ +     (colHeadsToDocbook opts alignStrings widths headers) $$  +     (vcat $ map (tableRowToDocbook opts alignStrings) rows) + +colHeadsToDocbook opts alignStrings widths headers = +  let heads = zipWith3 (\align width item ->  +              tableItemToDocbook opts "th" align width item)  +              alignStrings widths headers +  in  inTagsIndented "tr" $ vcat heads + +alignmentToString alignment = case alignment of +                                 AlignLeft -> "left" +                                 AlignRight -> "right" +                                 AlignCenter -> "center" +                                 AlignDefault -> "left" + +tableRowToDocbook opts aligns cols = inTagsIndented "tr" $  +  vcat $ zipWith3 (tableItemToDocbook opts "td") aligns (repeat 0) cols + +tableItemToDocbook opts tag align width item = +  let attrib = [("align", align)] ++  +               if width /= 0 +                  then [("style", "{width: " ++  +                        show (truncate (100*width)) ++ "%;}")] +                  else []  +  in  inTags True tag attrib $ vcat $ map (blockToDocbook opts) item + +-- | Take list of inline elements and return wrapped doc. +wrap :: WriterOptions -> [Inline] -> Doc +wrap opts lst = if writerWrapText opts +                   then fsep $ map (inlinesToDocbook opts) (splitBy Space lst) +                   else inlinesToDocbook opts lst + +-- | Convert a list of inline elements to Docbook. +inlinesToDocbook :: WriterOptions -> [Inline] -> Doc +inlinesToDocbook opts lst = hcat $ map (inlineToDocbook opts) lst + +-- | Convert an inline element to Docbook. +inlineToDocbook :: WriterOptions -> Inline -> Doc +inlineToDocbook opts (Str str) = text $ escapeStringForXML str  +inlineToDocbook opts (Emph lst) =  +  inTagsSimple "emphasis" $ inlinesToDocbook opts lst +inlineToDocbook opts (Strong lst) =  +  inTags False "emphasis" [("role", "strong")] $ inlinesToDocbook opts lst +inlineToDocbook opts (Strikeout lst) =  +  inTags False "emphasis" [("role", "strikethrough")] $ +  inlinesToDocbook opts lst +inlineToDocbook opts (Superscript lst) =  +  inTagsSimple "superscript" $ inlinesToDocbook opts lst +inlineToDocbook opts (Subscript lst) =  +  inTagsSimple "subscript" $ inlinesToDocbook opts lst +inlineToDocbook opts (Quoted _ lst) =  +  inTagsSimple "quote" $ inlinesToDocbook opts lst +inlineToDocbook opts Apostrophe = char '\'' +inlineToDocbook opts Ellipses = text "…" +inlineToDocbook opts EmDash = text "—"  +inlineToDocbook opts EnDash = text "–"  +inlineToDocbook opts (Code str) =  +  inTagsSimple "literal" $ text (escapeStringForXML str) +inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str) +inlineToDocbook opts (HtmlInline str) = empty +inlineToDocbook opts LineBreak = text $ "<literallayout></literallayout>"  +inlineToDocbook opts Space = char ' ' +inlineToDocbook opts (Link txt (src, tit)) = +  if isPrefixOf "mailto:" src +     then let src' = drop 7 src +              emailLink = inTagsSimple "email" $ text $  +                          escapeStringForXML $ src' +          in  if txt == [Code src'] +                 then emailLink +                 else inlinesToDocbook opts txt <+> char '(' <> emailLink <>  +                      char ')' +     else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt +inlineToDocbook opts (Image alt (src, tit)) =  +  let titleDoc = if null tit +                   then empty +                   else inTagsIndented "objectinfo" $ +                        inTagsIndented "title" (text $ escapeStringForXML tit) +  in  inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ +      titleDoc $$ selfClosingTag "imagedata" [("fileref", src)]  +inlineToDocbook opts (Note contents) =  +  inTagsIndented "footnote" $ blocksToDocbook opts contents diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs new file mode 100644 index 000000000..7ec95d8ef --- /dev/null +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -0,0 +1,458 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.HTML  +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of 'Pandoc' documents to HTML. +-} +module Text.Pandoc.Writers.HTML ( writeHtml , writeHtmlString ) where +import Text.Pandoc.Definition +import Text.Pandoc.ASCIIMathML +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Text.Pandoc.Shared +import Text.Regex ( mkRegex, matchRegex ) +import Numeric ( showHex ) +import Data.Char ( ord, toLower ) +import Data.List ( isPrefixOf, intersperse ) +import qualified Data.Set as S +import Control.Monad.State +import Text.XHtml.Transitional + +data WriterState = WriterState +    { stNotes            :: [Html]       -- ^ List of notes +    , stIds              :: [String]     -- ^ List of header identifiers +    , stMath             :: Bool         -- ^ Math is used in document +    , stCSS              :: S.Set String -- ^ CSS to include in header +    } deriving Show + +defaultWriterState :: WriterState +defaultWriterState = WriterState {stNotes= [], stIds = [],  +                                  stMath = False, stCSS = S.empty} + +-- Helpers to render HTML with the appropriate function. +render opts    = if writerWrapText opts then renderHtml else showHtml +renderFragment opts = if writerWrapText opts +                         then renderHtmlFragment +                         else showHtmlFragment + +-- | Convert Pandoc document to Html string. +writeHtmlString :: WriterOptions -> Pandoc -> String +writeHtmlString opts =  +  if writerStandalone opts +     then render opts . writeHtml opts +     else renderFragment opts . writeHtml opts + +-- | Convert Pandoc document to Html structure. +writeHtml :: WriterOptions -> Pandoc -> Html +writeHtml opts (Pandoc (Meta tit authors date) blocks) =  +  let titlePrefix = writerTitlePrefix opts +      topTitle    = evalState (inlineListToHtml opts tit) defaultWriterState +      topTitle'   = if null titlePrefix +                        then topTitle +                        else titlePrefix +++ " - " +++ topTitle +      metadata    = thetitle topTitle' +++  +                    meta ! [httpequiv "Content-Type",  +                            content "text/html; charset=UTF-8"] +++ +                    meta ! [name "generator", content "pandoc"] +++ +                    (toHtmlFromList $  +                    map (\a -> meta ! [name "author", content a]) authors) +++ +                    (if null date +                       then noHtml +                       else meta ! [name "date", content date]) +      titleHeader = if writerStandalone opts && not (null tit) &&  +                    not (writerS5 opts) +                        then h1 ! [theclass "title"] $ topTitle +                        else noHtml +      headerBlocks = filter isHeaderBlock blocks +      ids          = uniqueIdentifiers $  +                     map (\(Header _ lst) -> lst) headerBlocks +      toc          = if writerTableOfContents opts  +                        then tableOfContents opts headerBlocks ids +                        else noHtml +      (blocks', newstate) =  +                     runState (blockListToHtml opts blocks) +                     (defaultWriterState {stIds = ids}) +      cssLines     = stCSS newstate +      css          = if S.null cssLines +                        then noHtml +                        else style ! [thetype "text/css"] $ primHtml $ +                             '\n':(unlines $ S.toList cssLines) +      math         = if stMath newstate +                        then case writerASCIIMathMLURL opts of +                                   Just path -> script !  [src path,  +                                                thetype "text/javascript"] $ +                                                noHtml +                                   Nothing   -> primHtml asciiMathMLScript +                        else noHtml +      head         = header $ metadata +++ math +++ css +++  +                              primHtml (writerHeader opts) +      notes        = reverse (stNotes newstate) +      before       = primHtml $ writerIncludeBefore opts +      after        = primHtml $ writerIncludeAfter opts +      thebody      = before +++ titleHeader +++ toc +++ blocks' +++ +                     footnoteSection opts notes +++ after +  in  if writerStandalone opts +         then head +++ body thebody +         else thebody + +-- | Construct table of contents from list of header blocks and identifiers. +-- Assumes there are as many identifiers as header blocks. +tableOfContents :: WriterOptions -> [Block] -> [String] -> Html +tableOfContents _ [] _ = noHtml +tableOfContents opts headers ids = +  let opts'        = opts { writerIgnoreNotes = True } +      contentsTree = hierarchicalize headers +      contents     = evalState (mapM (elementToListItem opts') contentsTree)  +                     (defaultWriterState {stIds = ids}) +  in  thediv ! [identifier "toc"] $ unordList contents + +-- | Converts an Element to a list item for a table of contents, +-- retrieving the appropriate identifier from state. +elementToListItem :: WriterOptions -> Element -> State WriterState Html +elementToListItem opts (Blk _) = return noHtml +elementToListItem opts (Sec headerText subsecs) = do +  st <- get +  let ids = stIds st +  let (id, rest) = if null ids +                      then ("", []) +                      else (head ids, tail ids) +  put $ st {stIds = rest} +  txt <- inlineListToHtml opts headerText +  subHeads <- mapM (elementToListItem opts) subsecs +  let subList = if null subHeads +                   then noHtml +                   else unordList subHeads  +  return $ (anchor ! [href ("#" ++ id), identifier ("TOC-" ++ id)] $ txt) +++  +           subList + +-- | Convert list of Note blocks to a footnote <div>. +-- Assumes notes are sorted. +footnoteSection :: WriterOptions -> [Html] -> Html +footnoteSection opts notes = +  if null notes  +     then noHtml +     else thediv ! [theclass "footnotes"] $ hr +++ (olist << notes) + +-- | Obfuscate a "mailto:" link using Javascript. +obfuscateLink :: WriterOptions -> String -> String -> Html +obfuscateLink opts text src = +  let emailRegex = mkRegex "^mailto:([^@]*)@(.*)$" +      src'       = map toLower src +  in  case (matchRegex emailRegex src') of +        (Just [name, domain]) -> +          let domain'  = substitute "." " dot " domain +              at'      = obfuscateChar '@' +              (linkText, altText) =  +                 if text == drop 7 src' -- autolink +                    then ("'<code>'+e+'</code>'", name ++ " at " ++ domain') +                    else ("'" ++ text ++ "'", text ++ " (" ++ name ++ " at " ++  +                          domain' ++ ")") +          in  if writerStrictMarkdown opts +                then -- need to use primHtml or &'s are escaped to & in URL +                     primHtml $ "<a href=\"" ++ (obfuscateString src') +                     ++ "\">" ++ (obfuscateString text) ++ "</a>" +                else (script ! [thetype "text/javascript"] $ +                     primHtml ("\n<!--\nh='" ++  +                     obfuscateString domain ++ "';a='" ++ at' ++ "';n='" ++  +                     obfuscateString name ++ "';e=n+a+h;\n" ++ +                     "document.write('<a h'+'ref'+'=\"ma'+'ilto'+':'+e+'\">'+" ++  +                     linkText  ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++   +                     noscript (primHtml $ obfuscateString altText) +        _ -> anchor ! [href src] $ primHtml text  -- malformed email + +-- | Obfuscate character as entity. +obfuscateChar :: Char -> String +obfuscateChar char =  +  let num    = ord char +      numstr = if even num then show num else "x" ++ showHex num "" +  in  "&#" ++ numstr ++ ";" + +-- | Obfuscate string using entities. +obfuscateString :: String -> String +obfuscateString = concatMap obfuscateChar . decodeCharacterReferences + +-- | True if character is a punctuation character (unicode). +isPunctuation :: Char -> Bool +isPunctuation c = +  let c' = ord c +  in  if c `elem` "!\"'()*,-./:;<>?[\\]`{|}~" || c' >= 0x2000 && c' <= 0x206F || +         c' >= 0xE000 && c' <= 0xE0FF +         then True +         else False + +-- | Add CSS for document header. +addToCSS :: String -> State WriterState () +addToCSS item = do +  st <- get +  let current = stCSS st +  put $ st {stCSS = S.insert item current} + +-- | Convert Pandoc inline list to plain text identifier. +inlineListToIdentifier :: [Inline] -> String +inlineListToIdentifier [] = "" +inlineListToIdentifier (x:xs) =  +  xAsText ++ inlineListToIdentifier xs +  where xAsText = case x of +          Str s          -> filter (\c -> c == '-' || not (isPunctuation c)) $ +                            concat $ intersperse "-" $ words $ map toLower s +          Emph lst       -> inlineListToIdentifier lst +          Strikeout lst  -> inlineListToIdentifier lst +          Superscript lst -> inlineListToIdentifier lst +          Subscript lst  -> inlineListToIdentifier lst +          Strong lst     -> inlineListToIdentifier lst +          Quoted _ lst   -> inlineListToIdentifier lst +          Code s         -> s +          Space          -> "-" +          EmDash         -> "-" +          EnDash         -> "-" +          Apostrophe     -> "" +          Ellipses       -> "" +          LineBreak      -> "-" +          TeX _          -> "" +          HtmlInline _   -> "" +          Link lst _     -> inlineListToIdentifier lst +          Image lst _    -> inlineListToIdentifier lst +          Note _         -> "" + +-- | Return unique identifiers for list of inline lists. +uniqueIdentifiers :: [[Inline]] -> [String] +uniqueIdentifiers ls = +  let addIdentifier (nonuniqueIds, uniqueIds) l = +        let new = inlineListToIdentifier l +            matches = length $ filter (== new) nonuniqueIds +            new' = new ++ if matches > 0 then ("-" ++ show matches) else "" +        in  (new:nonuniqueIds, new':uniqueIds) +  in  reverse $ snd $ foldl addIdentifier ([],[]) ls + +-- | Convert Pandoc block element to HTML. +blockToHtml :: WriterOptions -> Block -> State WriterState Html +blockToHtml opts Null = return $ noHtml  +blockToHtml opts (Plain lst) = inlineListToHtml opts lst +blockToHtml opts (Para lst) = inlineListToHtml opts lst >>= (return . paragraph) +blockToHtml opts (RawHtml str) = return $ primHtml str +blockToHtml opts (HorizontalRule) = return $ hr +blockToHtml opts (CodeBlock str) = return $ pre $ thecode << (str ++ "\n")  +                       -- the final \n for consistency with Markdown.pl +blockToHtml opts (BlockQuote blocks) = +  -- in S5, treat list in blockquote specially +  -- if default is incremental, make it nonincremental;  +  -- otherwise incremental +  if writerS5 opts +     then let inc = not (writerIncremental opts) in +          case blocks of  +             [BulletList lst]  -> blockToHtml (opts {writerIncremental = inc}) +                                  (BulletList lst) +             [OrderedList attribs lst] ->  +                                  blockToHtml (opts {writerIncremental = inc}) +                                  (OrderedList attribs lst) +             otherwise         -> blockListToHtml opts blocks >>=  +                                  (return . blockquote) +     else blockListToHtml opts blocks >>= (return . blockquote) +blockToHtml opts (Header level lst) = do  +  contents <- inlineListToHtml opts lst +  st <- get +  let ids = stIds st +  let (id, rest) = if null ids +                      then ("", []) +                      else (head ids, tail ids) +  put $ st {stIds = rest} +  let attribs = if writerStrictMarkdown opts && not (writerTableOfContents opts) +                   then [] +                   else [identifier id] +  let contents'  = if writerTableOfContents opts +                      then anchor ! [href ("#TOC-" ++ id)] $ contents +                      else contents +  return $ case level of +              1 -> h1 contents' ! attribs +              2 -> h2 contents' ! attribs +              3 -> h3 contents' ! attribs +              4 -> h4 contents' ! attribs +              5 -> h5 contents' ! attribs +              6 -> h6 contents' ! attribs +              _ -> paragraph contents' ! attribs +blockToHtml opts (BulletList lst) = do +  contents <- mapM (blockListToHtml opts) lst +  let attribs = if writerIncremental opts +                   then [theclass "incremental"] +                   else [] +  return $ unordList ! attribs $ contents +blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do +  contents <- mapM (blockListToHtml opts) lst +  let numstyle' = camelCaseToHyphenated $ show numstyle +  let attribs = (if writerIncremental opts +                   then [theclass "incremental"] +                   else []) ++ +                (if startnum /= 1 +                   then [start startnum] +                   else []) ++ +                (if numstyle /= DefaultStyle +                   then [theclass numstyle'] +                   else []) +  if numstyle /= DefaultStyle +     then addToCSS $ "ol." ++ numstyle' ++  +                     " { list-style-type: " ++  +                     numstyle' ++ "; }" +     else return () +  return $ ordList ! attribs $ contents +blockToHtml opts (DefinitionList lst) = do +  contents <- mapM (\(term, def) -> do term' <- inlineListToHtml opts term +                                       def' <- blockListToHtml opts def +                                       return $ (term', def')) lst +  let attribs = if writerIncremental opts +                   then [theclass "incremental"] +                   else [] +  return $ defList ! attribs $ contents +blockToHtml opts (Table capt aligns widths headers rows) = do +  let alignStrings = map alignmentToString aligns +  captionDoc <- if null capt +                   then return noHtml +                   else inlineListToHtml opts capt >>= return . caption +  colHeads <- colHeadsToHtml opts alignStrings  +                             widths headers +  rows' <- mapM (tableRowToHtml opts alignStrings) rows +  return $ table $ captionDoc +++ colHeads +++ rows' + +colHeadsToHtml opts alignStrings widths headers = do +  heads <- sequence $ zipWith3  +           (\align width item -> tableItemToHtml opts th align width item)  +           alignStrings widths headers +  return $ tr $ toHtmlFromList heads + +alignmentToString alignment = case alignment of +                                 AlignLeft    -> "left" +                                 AlignRight   -> "right" +                                 AlignCenter  -> "center" +                                 AlignDefault -> "left" + +tableRowToHtml opts aligns cols =  +  (sequence $ zipWith3 (tableItemToHtml opts td) aligns (repeat 0) cols) >>= +  return . tr . toHtmlFromList + +tableItemToHtml opts tag align' width item = do +  contents <- blockListToHtml opts item +  let attrib = [align align'] ++  +               if width /= 0  +                  then [thestyle ("width: " ++ show (truncate (100*width)) ++  +                        "%;")] +                  else []  +  return $ tag ! attrib $ contents + +blockListToHtml :: WriterOptions -> [Block] -> State WriterState Html +blockListToHtml opts lst =  +  mapM (blockToHtml opts) lst >>= return . toHtmlFromList + +-- | Convert list of Pandoc inline elements to HTML. +inlineListToHtml :: WriterOptions -> [Inline] -> State WriterState Html +inlineListToHtml opts lst =  +  mapM (inlineToHtml opts) lst >>= return . toHtmlFromList + +-- | Convert Pandoc inline element to HTML. +inlineToHtml :: WriterOptions -> Inline -> State WriterState Html +inlineToHtml opts inline = +  case inline of   +    (Str str)        -> return $ stringToHtml str +    (Space)          -> return $ stringToHtml " " +    (LineBreak)      -> return $ br +    (EmDash)         -> return $ primHtmlChar "mdash" +    (EnDash)         -> return $ primHtmlChar "ndash" +    (Ellipses)       -> return $ primHtmlChar "hellip" +    (Apostrophe)     -> return $ primHtmlChar "rsquo" +    (Emph lst)       -> inlineListToHtml opts lst >>= return . emphasize +    (Strong lst)     -> inlineListToHtml opts lst >>= return . strong +    (Code str)       -> return $ thecode << str +    (Strikeout lst)  -> addToCSS  +                        ".strikeout { text-decoration: line-through; }" >>  +                        inlineListToHtml opts lst >>= +                        return . (thespan ! [theclass "strikeout"]) +    (Superscript lst) -> inlineListToHtml opts lst >>= return . sup +    (Subscript lst)   -> inlineListToHtml opts lst >>= return . sub +    (Quoted quoteType lst) -> +                        let (leftQuote, rightQuote) = case quoteType of +                              SingleQuote -> (primHtmlChar "lsquo",  +                                              primHtmlChar "rsquo") +                              DoubleQuote -> (primHtmlChar "ldquo",  +                                              primHtmlChar "rdquo") +                        in  do contents <- inlineListToHtml opts lst +                               return $ leftQuote +++ contents +++ rightQuote +    (TeX str)        -> (if writerUseASCIIMathML opts +                            then modify (\st -> st {stMath = True}) +                            else return ()) >> return (stringToHtml str) +    (HtmlInline str) -> return $ primHtml str  +    (Link [Code str] (src,tit)) | "mailto:" `isPrefixOf` src -> +                        return $ obfuscateLink opts str src +    (Link txt (src,tit)) | "mailto:" `isPrefixOf` src -> do +                        linkText <- inlineListToHtml opts txt   +                        return $ obfuscateLink opts (show linkText) src +    (Link txt (src,tit)) -> do +                        linkText <- inlineListToHtml opts txt +                        return $ anchor ! ([href src] ++  +                                 if null tit then [] else [title tit]) $  +                                 linkText +    (Image txt (source,tit)) -> do +                        alternate <- inlineListToHtml opts txt +                        let alternate' = renderFragment opts alternate +                        let attributes = [src source] ++ +                                         (if null tit  +                                            then []  +                                            else [title tit]) ++  +                                         if null txt  +                                            then []  +                                            else [alt alternate'] +                        return $ image ! attributes  +                        -- note:  null title included, as in Markdown.pl  +    (Note contents)          -> do  +                        st <- get +                        let notes = stNotes st +                        let number = (length notes) + 1 +                        let ref = show number +                        htmlContents <- blockListToNote opts ref contents  +                        -- push contents onto front of notes +                        put $ st {stNotes = (htmlContents:notes)}  +                        return $ anchor ! [href ("#fn" ++ ref), +                                          theclass "footnoteRef", +                                          identifier ("fnref" ++ ref)] <<  +                                          sup << ref + +blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html +blockListToNote opts ref blocks = +  -- If last block is Para or Plain, include the backlink at the end of +  -- that block. Otherwise, insert a new Plain block with the backlink. +  let backlink = [HtmlInline $ " <a href=\"#fnref" ++ ref ++  +                 "\" class=\"footnoteBackLink\"" ++ +                 " title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"] +      blocks'  = if null blocks +                    then [] +                    else let lastBlock   = last blocks +                             otherBlocks = init blocks +                         in  case lastBlock of +                                  (Para lst)  -> otherBlocks ++ +                                                 [Para (lst ++ backlink)] +                                  (Plain lst) -> otherBlocks ++ +                                                 [Plain (lst ++ backlink)] +                                  _           -> otherBlocks ++ [lastBlock, +                                                 Plain backlink] +  in  do contents <- blockListToHtml opts blocks' +         return $ li ! [identifier ("fn" ++ ref)] $ contents + diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs new file mode 100644 index 000000000..f64e06e24 --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -0,0 +1,310 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.LaTeX +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha  +   Portability : portable + +Conversion of 'Pandoc' format into LaTeX. +-} +module Text.Pandoc.Writers.LaTeX ( writeLaTeX ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Printf ( printf ) +import Data.List ( (\\), isInfixOf, isSuffixOf, intersperse ) +import Data.Char ( toLower ) +import qualified Data.Set as S +import Control.Monad.State +import Text.PrettyPrint.HughesPJ hiding ( Str ) + +data WriterState =  +  WriterState { stIncludes :: S.Set String -- strings to include in header +              , stInNote   :: Bool         -- @True@ if we're in a note +              , stOLLevel  :: Int }        -- level of ordered list nesting  + +-- | Add line to header. +addToHeader :: String -> State WriterState () +addToHeader str = do +  st <- get +  let includes = stIncludes st +  put st {stIncludes = S.insert str includes} + +-- | Convert Pandoc to LaTeX. +writeLaTeX :: WriterOptions -> Pandoc -> String +writeLaTeX options document =  +  render $ evalState (pandocToLaTeX options document) $  +  WriterState { stIncludes = S.empty, stInNote = False, stOLLevel = 1 }  + +pandocToLaTeX :: WriterOptions -> Pandoc -> State WriterState Doc +pandocToLaTeX options (Pandoc meta blocks) = do +  main     <- blockListToLaTeX blocks +  head     <- if writerStandalone options +                 then latexHeader options meta +                 else return empty +  let before = if null (writerIncludeBefore options) +                  then empty +                  else text (writerIncludeBefore options) +  let after  = if null (writerIncludeAfter options) +                  then empty +                  else text (writerIncludeAfter options) +  let body = before $$ main $$ after +  let toc  =  if writerTableOfContents options +                 then text "\\tableofcontents\n" +                 else empty  +  let foot = if writerStandalone options +                then text "\\end{document}" +                else empty  +  return $ head $$ toc $$ body $$ foot + +-- | Insert bibliographic information into LaTeX header. +latexHeader :: WriterOptions -- ^ Options, including LaTeX header +            -> Meta          -- ^ Meta with bibliographic information +            -> State WriterState Doc +latexHeader options (Meta title authors date) = do +  titletext <- if null title +                  then return empty +                  else inlineListToLaTeX title >>= return . inCmd "title" +  headerIncludes <- get >>= return . S.toList . stIncludes +  let extras = text $ unlines headerIncludes +  let verbatim  = if "\\usepackage{fancyvrb}" `elem` headerIncludes +                     then text "\\VerbatimFootnotes % allows verbatim text in footnotes" +                     else empty +  let authorstext = text $ "\\author{" ++  +                    joinWithSep "\\\\" (map stringToLaTeX authors) ++ "}" +  let datetext  = if date == "" +                     then empty  +                     else text $ "\\date{" ++ stringToLaTeX date ++ "}" +  let maketitle = if null title then empty else text "\\maketitle" +  let secnumline = if (writerNumberSections options) +                      then empty  +                      else text "\\setcounter{secnumdepth}{0}" +  let baseHeader = text $ writerHeader options +  let header     = baseHeader $$ extras +  return $ header $$ secnumline $$ verbatim $$ titletext $$ authorstext $$ +           datetext $$ text "\\begin{document}" $$ maketitle $$ text "" + +-- escape things as needed for LaTeX + +stringToLaTeX :: String -> String +stringToLaTeX = escapeStringUsing latexEscapes +  where latexEscapes = backslashEscapes "{}$%&_#" ++  +                       [ ('^', "\\^{}") +                       , ('\\', "\\textbackslash{}") +                       , ('~', "\\ensuremath{\\sim}") +                       , ('|', "\\textbar{}") +                       , ('<', "\\textless{}") +                       , ('>', "\\textgreater{}") +                       ] + +-- | Puts contents into LaTeX command. +inCmd :: String -> Doc -> Doc +inCmd cmd contents = char '\\' <> text cmd <> braces contents + +-- | Remove all code elements from list of inline elements +-- (because it's illegal to have verbatim inside some command arguments) +deVerb :: [Inline] -> [Inline] +deVerb [] = [] +deVerb ((Code str):rest) =  +  (TeX $ "\\texttt{" ++ stringToLaTeX str ++ "}"):(deVerb rest) +deVerb (other:rest) = other:(deVerb rest) + +-- | Convert Pandoc block element to LaTeX. +blockToLaTeX :: Block     -- ^ Block to convert +             -> State WriterState Doc +blockToLaTeX Null = return empty +blockToLaTeX (Plain lst) = wrapped inlineListToLaTeX lst >>= return  +blockToLaTeX (Para lst) =  +  wrapped inlineListToLaTeX lst >>= return . (<> char '\n') +blockToLaTeX (BlockQuote lst) = do +  contents <- blockListToLaTeX lst +  return $ text "\\begin{quote}" $$ contents $$ text "\\end{quote}" +blockToLaTeX (CodeBlock str) = do +  st <- get +  env <- if stInNote st +            then do addToHeader "\\usepackage{fancyvrb}" +                    return "Verbatim" +            else return "verbatim" +  return $ text ("\\begin{" ++ env ++ "}\n") <> text str <>  +           text ("\n\\end{" ++ env ++ "}") +blockToLaTeX (RawHtml str) = return empty +blockToLaTeX (BulletList lst) = do +  items <- mapM listItemToLaTeX lst +  return $ text "\\begin{itemize}" $$ vcat items $$ text "\\end{itemize}" +blockToLaTeX (OrderedList (start, numstyle, numdelim) lst) = do +  st <- get +  let oldlevel = stOLLevel st +  put $ st {stOLLevel = oldlevel + 1} +  items <- mapM listItemToLaTeX lst +  modify (\st -> st {stOLLevel = oldlevel}) +  exemplar <- if numstyle /= DefaultStyle || numdelim /= DefaultDelim +                 then do addToHeader "\\usepackage{enumerate}" +                         return $ char '[' <>  +                                  text (head (orderedListMarkers (1, numstyle, +                                  numdelim))) <> char ']' +                 else return empty +  let resetcounter = if start /= 1 && oldlevel <= 4 +                        then text $ "\\setcounter{enum" ++  +                             map toLower (toRomanNumeral oldlevel) ++ +                             "}{" ++ show (start - 1) ++ "}" +                        else empty  +  return $ text "\\begin{enumerate}" <> exemplar $$ resetcounter $$ +           vcat items $$ text "\\end{enumerate}" +blockToLaTeX (DefinitionList lst) = do +  items <- mapM defListItemToLaTeX lst +  return $ text "\\begin{description}" $$ vcat items $$ +           text "\\end{description}" +blockToLaTeX HorizontalRule = return $ text $ +    "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n" +blockToLaTeX (Header level lst) = do +  txt <- inlineListToLaTeX (deVerb lst) +  return $ if (level > 0) && (level <= 3) +              then text ("\\" ++ (concat (replicate (level - 1) "sub")) ++  +                   "section{") <> txt <> text "}\n" +              else txt <> char '\n' +blockToLaTeX (Table caption aligns widths heads rows) = do +  headers <- tableRowToLaTeX heads +  captionText <- inlineListToLaTeX caption +  rows' <- mapM tableRowToLaTeX rows +  let colWidths = map (printf "%.2f") widths +  let colDescriptors = concat $ zipWith +                                (\width align -> ">{\\PBS" ++  +                                (case align of  +                                       AlignLeft -> "\\raggedright" +                                       AlignRight -> "\\raggedleft" +                                       AlignCenter -> "\\centering" +                                       AlignDefault -> "\\raggedright") ++ +                                "\\hspace{0pt}}p{" ++ width ++  +                                "\\columnwidth}") +                                colWidths aligns +  let tableBody = text ("\\begin{tabular}{" ++ colDescriptors ++ "}") $$ +                  headers $$ text "\\hline" $$ vcat rows' $$  +                  text "\\end{tabular}"  +  let centered txt = text "\\begin{center}" $$ txt $$ text "\\end{center}" +  addToHeader "\\usepackage{array}\n\ +    \% This is needed because raggedright in table elements redefines \\\\:\n\ +    \\\newcommand{\\PreserveBackslash}[1]{\\let\\temp=\\\\#1\\let\\\\=\\temp}\n\ +    \\\let\\PBS=\\PreserveBackslash" +  return $ if isEmpty captionText +              then centered tableBody <> char '\n' +              else text "\\begin{table}[h]" $$ centered tableBody $$  +                   inCmd "caption" captionText $$ text "\\end{table}\n"  + +blockListToLaTeX lst = mapM blockToLaTeX lst >>= return . vcat + +tableRowToLaTeX cols = mapM blockListToLaTeX cols >>=  +  return . ($$ text "\\\\") . foldl (\row item -> row $$ +  (if isEmpty row then empty else text " & ") <> item) empty + +listItemToLaTeX lst = blockListToLaTeX lst >>= return .  (text "\\item " $$) . +                      (nest 2) + +defListItemToLaTeX (term, def) = do +    term' <- inlineListToLaTeX $ deVerb term +    def'  <- blockListToLaTeX def +    return $ text "\\item[" <> term' <> text "]" $$ def' + +-- | Convert list of inline elements to LaTeX. +inlineListToLaTeX :: [Inline]  -- ^ Inlines to convert +                  -> State WriterState Doc +inlineListToLaTeX lst = mapM inlineToLaTeX lst >>= return . hcat + +isQuoted :: Inline -> Bool +isQuoted (Quoted _ _) = True +isQuoted Apostrophe = True +isQuoted _ = False + +-- | Convert inline element to LaTeX +inlineToLaTeX :: Inline    -- ^ Inline to convert +              -> State WriterState Doc +inlineToLaTeX (Emph lst) = +  inlineListToLaTeX (deVerb lst) >>= return . inCmd "emph" +inlineToLaTeX (Strong lst) =  +  inlineListToLaTeX (deVerb lst) >>= return . inCmd "textbf"  +inlineToLaTeX (Strikeout lst) = do +  contents <- inlineListToLaTeX $ deVerb lst +  addToHeader "\\usepackage[normalem]{ulem}" +  return $ inCmd "sout" contents +inlineToLaTeX (Superscript lst) =  +  inlineListToLaTeX (deVerb lst) >>= return . inCmd "textsuperscript" +inlineToLaTeX (Subscript lst) = do +  contents <- inlineListToLaTeX $ deVerb lst +  -- oddly, latex includes \textsuperscript but not \textsubscript +  -- so we have to define it: +  addToHeader "\\newcommand{\\textsubscript}[1]{\\ensuremath{_{\\scriptsize\\textrm{#1}}}}" +  return $ inCmd "textsubscript" contents +inlineToLaTeX (Code str) = do +  st <- get +  if stInNote st +     then do addToHeader "\\usepackage{fancyvrb}" +     else return () +  let chr = ((enumFromTo '!' '~') \\ str) !! 0 +  return $ text $ "\\verb" ++ [chr] ++ str ++ [chr] +inlineToLaTeX (Quoted SingleQuote lst) = do +  contents <- inlineListToLaTeX lst +  let s1 = if (not (null lst)) && (isQuoted (head lst)) +              then text "\\," +              else empty  +  let s2 = if (not (null lst)) && (isQuoted (last lst)) +              then text "\\," +              else empty +  return $ char '`' <> s1 <> contents <> s2 <> char '\'' +inlineToLaTeX (Quoted DoubleQuote lst) = do +  contents <- inlineListToLaTeX lst +  let s1 = if (not (null lst)) && (isQuoted (head lst)) +              then text "\\," +              else empty  +  let s2 = if (not (null lst)) && (isQuoted (last lst)) +              then text "\\," +              else empty +  return $ text "``" <> s1 <> contents <> s2 <> text "''" +inlineToLaTeX Apostrophe = return $ char '\'' +inlineToLaTeX EmDash = return $ text "---" +inlineToLaTeX EnDash = return $ text "--" +inlineToLaTeX Ellipses = return $ text "\\ldots{}" +inlineToLaTeX (Str str) = return $ text $ stringToLaTeX str +inlineToLaTeX (TeX str) = return $ text str +inlineToLaTeX (HtmlInline str) = return empty +inlineToLaTeX (LineBreak) = return $ text "\\\\"  +inlineToLaTeX Space = return $ char ' ' +inlineToLaTeX (Link txt (src, _)) = do +  addToHeader "\\usepackage[breaklinks=true]{hyperref}" +  case txt of +        [Code x] | x == src ->  -- autolink +             do addToHeader "\\usepackage{url}"  +                return $ text $ "\\url{" ++ x ++ "}" +        _ -> do contents <- inlineListToLaTeX $ deVerb txt +                return $ text ("\\href{" ++ src ++ "}{") <> contents <>  +                         char '}' +inlineToLaTeX (Image alternate (source, tit)) = do +  addToHeader "\\usepackage{graphicx}" +  return $ text $ "\\includegraphics{" ++ source ++ "}"  +inlineToLaTeX (Note contents) = do +  st <- get +  put (st {stInNote = True}) +  contents' <- blockListToLaTeX contents +  modify (\st -> st {stInNote = False}) +  let rawnote = stripTrailingNewlines $ render contents' +  -- note: a \n before } is needed when note ends with a Verbatim environment +  let optNewline = "\\end{Verbatim}" `isSuffixOf` rawnote +  return $ text "%\n\\footnote{" <>  +           text rawnote <> (if optNewline then char '\n' else empty) <> char '}' diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs new file mode 100644 index 000000000..8e14c2bf0 --- /dev/null +++ b/src/Text/Pandoc/Writers/Man.hs @@ -0,0 +1,293 @@ +{- +Copyright (C) 2007 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.Man  +   Copyright   : Copyright (C) 2007 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of 'Pandoc' documents to groff man page format. + +-} +module Text.Pandoc.Writers.Man ( writeMan) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.Printf ( printf ) +import Data.List ( isPrefixOf, drop, nub, intersperse ) +import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Control.Monad.State + +type Notes = [[Block]] +type Preprocessors = [String] -- e.g. "t" for tbl +type WriterState = (Notes, Preprocessors) + +-- | Convert Pandoc to Man. +writeMan :: WriterOptions -> Pandoc -> String +writeMan opts document = render $ evalState (pandocToMan opts document) ([],[])  + +-- | Return groff man representation of document. +pandocToMan :: WriterOptions -> Pandoc -> State WriterState Doc +pandocToMan opts (Pandoc meta blocks) = do +  let before  = writerIncludeBefore opts +  let after   = writerIncludeAfter opts +  let before' = if null before then empty else text before +  let after'  = if null after then empty else text after +  (head, foot) <- metaToMan opts meta +  body <- blockListToMan opts blocks +  (notes, preprocessors) <- get +  let preamble = if null preprocessors || not (writerStandalone opts) +                    then empty +                    else text $ ".\\\" " ++ concat (nub preprocessors) +  notes' <- notesToMan opts (reverse notes) +  return $ preamble $$ head $$ before' $$ body $$ notes' $$ foot $$ after' + +-- | Insert bibliographic information into Man header and footer. +metaToMan :: WriterOptions -- ^ Options, including Man header +          -> Meta          -- ^ Meta with bibliographic information +          -> State WriterState (Doc, Doc) +metaToMan options (Meta title authors date) = do +  titleText <- inlineListToMan options title +  let (cmdName, rest) = break (== ' ') $ render titleText +  let (title', section) = case reverse cmdName of +                            (')':d:'(':xs) | d `elem` ['0'..'9'] ->  +                                  (text (reverse xs), char d) +                            xs -> (text (reverse xs), doubleQuotes empty)                     +  let extras = map (doubleQuotes . text . removeLeadingTrailingSpace) $ +               splitBy '|' rest +  let head = (text ".TH") <+> title' <+> section <+>  +             doubleQuotes (text date) <+> hsep extras +  let foot = case length authors of +                0 -> empty +                1 -> text ".SH AUTHOR" $$ (text $ joinWithSep ", " authors) +                2 -> text ".SH AUTHORS" $$ (text $ joinWithSep ", " authors) +  return $ if writerStandalone options +              then (head, foot) +              else (empty, empty) + +-- | Return man representation of notes. +notesToMan :: WriterOptions -> [[Block]] -> State WriterState Doc +notesToMan opts notes = +  if null notes +     then return empty +     else mapM (\(num, note) -> noteToMan opts num note) (zip [1..] notes) >>=  +          return . (text ".SH NOTES" $$) . vcat + +-- | Return man representation of a note. +noteToMan :: WriterOptions -> Int -> [Block] -> State WriterState Doc +noteToMan opts num note = do +  contents <- blockListToMan opts note +  let marker = text "\n.SS [" <> text (show num) <> char ']' +  return $ marker $$ contents  + +-- | Association list of characters to escape. +manEscapes :: [(Char, String)] +manEscapes = [('\160', "\\ "), ('\'', "\\[aq]")] ++ backslashEscapes "\".@\\" + +-- | Escape special characters for Man. +escapeString :: String -> String +escapeString = escapeStringUsing manEscapes + +-- | Escape a literal (code) section for Man. +escapeCode :: String -> String +escapeCode = escapeStringUsing (manEscapes ++ backslashEscapes "\t ") + +-- | Convert Pandoc block element to man. +blockToMan :: WriterOptions -- ^ Options +                -> Block         -- ^ Block element +                -> State WriterState Doc  +blockToMan opts Null = return empty +blockToMan opts (Plain inlines) =  +  wrapIfNeeded opts (inlineListToMan opts) inlines +blockToMan opts (Para inlines) = do +  contents <- wrapIfNeeded opts (inlineListToMan opts) inlines +  return $ text ".PP" $$ contents  +blockToMan opts (RawHtml str) = return $ text str +blockToMan opts HorizontalRule = return $ text $ ".PP\n   *   *   *   *   *" +blockToMan opts (Header level inlines) = do +  contents <- inlineListToMan opts inlines +  let heading = case level of +                  1 -> ".SH " +                  _ -> ".SS " +  return $ text heading <> contents  +blockToMan opts (CodeBlock str) = return $ +  text ".PP" $$ text "\\f[CR]" $$  +  text ((unlines . map ("      " ++) . lines) (escapeCode str)) <> text "\\f[]" +blockToMan opts (BlockQuote blocks) = do   +  contents <- blockListToMan opts blocks +  return $ text ".RS" $$ contents $$ text ".RE" +blockToMan opts (Table caption alignments widths headers rows) =  +  let aligncode AlignLeft    = "l" +      aligncode AlignRight   = "r" +      aligncode AlignCenter  = "c" +      aligncode AlignDefault = "l" +  in do +  caption' <- inlineListToMan opts caption +  modify (\(notes, preprocessors) -> (notes, "t":preprocessors)) +  let iwidths = map (printf "w(%0.2fn)" . (70 *)) widths  +  -- 78n default width - 8n indent = 70n +  let coldescriptions = text $ joinWithSep " "  +                        (zipWith (\align width -> aligncode align ++ width)  +                        alignments iwidths) ++ "." +  colheadings <- mapM (blockListToMan opts) headers +  let makeRow cols = text "T{" $$  +                     (vcat $ intersperse (text "T}@T{") cols) $$  +                     text "T}" +  let colheadings' = makeRow colheadings +  body <- mapM (\row -> do  +                         cols <- mapM (blockListToMan opts) row +                         return $ makeRow cols) rows +  return $ text ".PP" $$ caption' $$  +           text ".TS" $$ text "tab(@);" $$ coldescriptions $$  +           colheadings' $$ char '_' $$ vcat body $$ text ".TE" + +blockToMan opts (BulletList items) = do +  contents <- mapM (bulletListItemToMan opts) items +  return (vcat contents)  +blockToMan opts (OrderedList attribs items) = do +  let markers = take (length items) $ orderedListMarkers attribs   +  let indent = 1 + (maximum $ map length markers) +  contents <- mapM (\(num, item) -> orderedListItemToMan opts num indent item) $ +              zip markers items   +  return (vcat contents) +blockToMan opts (DefinitionList items) = do   +  contents <- mapM (definitionListItemToMan opts) items +  return (vcat contents) + +-- | Convert bullet list item (list of blocks) to man. +bulletListItemToMan :: WriterOptions -> [Block] -> State WriterState Doc +bulletListItemToMan opts [] = return empty +bulletListItemToMan opts ((Para first):rest) =  +  bulletListItemToMan opts ((Plain first):rest) +bulletListItemToMan opts ((Plain first):rest) = do +  first' <- blockToMan opts (Plain first)  +  rest' <- blockListToMan opts rest +  let first'' = text ".IP \\[bu] 2" $$ first' +  let rest''  = if null rest +                   then empty +                   else text ".RS 2" $$ rest' $$ text ".RE" +  return (first'' $$ rest'')  +bulletListItemToMan opts (first:rest) = do +  first' <- blockToMan opts first +  rest' <- blockListToMan opts rest +  return $ text "\\[bu] .RS 2" $$ first' $$ rest' $$ text ".RE" +  +-- | Convert ordered list item (a list of blocks) to man. +orderedListItemToMan :: WriterOptions -- ^ options +                          -> String   -- ^ order marker for list item +                          -> Int      -- ^ number of spaces to indent +                          -> [Block]  -- ^ list item (list of blocks) +                          -> State WriterState Doc +orderedListItemToMan _ _ _ [] = return empty +orderedListItemToMan opts num indent ((Para first):rest) =  +  orderedListItemToMan opts num indent ((Plain first):rest) +orderedListItemToMan opts num indent (first:rest) = do +  first' <- blockToMan opts first +  rest' <- blockListToMan opts rest +  let num' = printf ("%" ++ show (indent - 1) ++ "s") num +  let first'' = text (".IP \"" ++ num' ++ "\" " ++ show indent) $$ first' +  let rest''  = if null rest +                   then empty +                   else text ".RS 4" $$ rest' $$ text ".RE" +  return $ first'' $$ rest''  + +-- | Convert definition list item (label, list of blocks) to man. +definitionListItemToMan :: WriterOptions +                             -> ([Inline],[Block])  +                             -> State WriterState Doc +definitionListItemToMan opts (label, items) = do +  labelText <- inlineListToMan opts label +  contents <- if null items +                 then return empty +                 else do  +                        let (first, rest) = case items of +                             ((Para x):y) -> (Plain x,y) +                             (x:y)        -> (x,y) +                        rest' <- mapM (\item -> blockToMan opts item) +                                 rest >>= (return . vcat) +                        first' <- blockToMan opts first +                        return $ first' $$ text ".RS" $$ rest' $$ text ".RE" +  return $ text ".TP\n.B " <> labelText $+$ contents + +-- | Convert list of Pandoc block elements to man. +blockListToMan :: WriterOptions -- ^ Options +                    -> [Block]       -- ^ List of block elements +                    -> State WriterState Doc  +blockListToMan opts blocks = +  mapM (blockToMan opts) blocks >>= (return . vcat) + +-- | Convert list of Pandoc inline elements to man. +inlineListToMan :: WriterOptions -> [Inline] -> State WriterState Doc +inlineListToMan opts lst = mapM (inlineToMan opts) lst >>= (return . hcat) + +-- | Convert Pandoc inline element to man. +inlineToMan :: WriterOptions -> Inline -> State WriterState Doc +inlineToMan opts (Emph lst) = do  +  contents <- inlineListToMan opts lst +  return $ text "\\f[I]" <> contents <> text "\\f[]" +inlineToMan opts (Strong lst) = do +  contents <- inlineListToMan opts lst +  return $ text "\\f[B]" <> contents <> text "\\f[]" +inlineToMan opts (Strikeout lst) = do +  contents <- inlineListToMan opts lst +  return $ text "[STRIKEOUT:" <> contents <> char ']' +inlineToMan opts (Superscript lst) = do +  contents <- inlineListToMan opts lst +  return $ char '^' <> contents <> char '^' +inlineToMan opts (Subscript lst) = do +  contents <- inlineListToMan opts lst +  return $ char '~' <> contents <> char '~' +inlineToMan opts (Quoted SingleQuote lst) = do +  contents <- inlineListToMan opts lst +  return $ char '`' <> contents <> char '\'' +inlineToMan opts (Quoted DoubleQuote lst) = do +  contents <- inlineListToMan opts lst +  return $ text "\\[lq]" <> contents <> text "\\[rq]" +inlineToMan opts EmDash = return $ text "\\[em]" +inlineToMan opts EnDash = return $ text "\\[en]" +inlineToMan opts Apostrophe = return $ char '\'' +inlineToMan opts Ellipses = return $ text "\\&..." +inlineToMan opts (Code str) = +  return $ text $ "\\f[B]" ++ escapeCode str ++ "\\f[]" +inlineToMan opts (Str str) = return $ text $ escapeString str +inlineToMan opts (TeX str) = return $ text $ escapeCode str +inlineToMan opts (HtmlInline str) = return $ text $ escapeCode str  +inlineToMan opts (LineBreak) = return $ text "\n.PD 0\n.P\n.PD\n" +inlineToMan opts Space = return $ char ' ' +inlineToMan opts (Link txt (src, _)) = do +  linktext <- inlineListToMan opts txt +  let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src +  return $ if txt == [Code srcSuffix] +              then char '<' <> text srcSuffix <> char '>'  +              else linktext <> text " (" <> text src <> char ')'  +inlineToMan opts (Image alternate (source, tit)) = do +  let txt = if (null alternate) || (alternate == [Str ""]) ||  +               (alternate == [Str source]) -- to prevent autolinks +               then [Str "image"] +               else alternate +  linkPart <- inlineToMan opts (Link txt (source, tit))  +  return $ char '[' <> text "IMAGE: " <> linkPart <> char ']' +inlineToMan opts (Note contents) = do  +  modify (\(notes, prep) -> (contents:notes, prep)) -- add to notes in state +  (notes, _) <- get +  let ref = show $ (length notes) +  return $ char '[' <> text ref <> char ']' + diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs new file mode 100644 index 000000000..4cecaae5d --- /dev/null +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -0,0 +1,373 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.Markdown  +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of 'Pandoc' documents to markdown-formatted plain text. + +Markdown:  <http://daringfireball.net/projects/markdown/> +-} +module Text.Pandoc.Writers.Markdown ( writeMarkdown) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.Pandoc.Blocks +import Text.ParserCombinators.Parsec ( parse, (<|>), GenParser ) +import Data.List ( group, isPrefixOf, drop, find, intersperse ) +import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Control.Monad.State + +type Notes = [[Block]] +type Refs = KeyTable +type WriterState = (Notes, Refs) + +-- | Convert Pandoc to Markdown. +writeMarkdown :: WriterOptions -> Pandoc -> String +writeMarkdown opts document =  +  render $ evalState (pandocToMarkdown opts document) ([],[])  + +-- | Return markdown representation of document. +pandocToMarkdown :: WriterOptions -> Pandoc -> State WriterState Doc +pandocToMarkdown opts (Pandoc meta blocks) = do +  let before  = writerIncludeBefore opts +  let after   = writerIncludeAfter opts +  let before' = if null before then empty else text before +  let after'  = if null after then empty else text after +  metaBlock <- metaToMarkdown opts meta +  let head = if writerStandalone opts +                then metaBlock $+$ text (writerHeader opts) +                else empty +  let headerBlocks = filter isHeaderBlock blocks +  let toc = if writerTableOfContents opts  +               then tableOfContents opts headerBlocks +               else empty +  body <- blockListToMarkdown opts blocks +  (notes, _) <- get +  notes' <- notesToMarkdown opts (reverse notes) +  (_, refs) <- get  -- note that the notes may contain refs +  refs' <- keyTableToMarkdown opts (reverse refs) +  return $ head $+$ before' $+$ toc $+$ body $+$ text "" $+$  +           notes' $+$ text "" $+$ refs' $+$ after' + +-- | Return markdown representation of reference key table. +keyTableToMarkdown :: WriterOptions -> KeyTable -> State WriterState Doc +keyTableToMarkdown opts refs = mapM (keyToMarkdown opts) refs >>= return . vcat +  +-- | Return markdown representation of a reference key.  +keyToMarkdown :: WriterOptions  +              -> ([Inline], (String, String))  +              -> State WriterState Doc +keyToMarkdown opts (label, (src, tit)) = do +  label' <- inlineListToMarkdown opts label +  let tit' = if null tit then empty else text $ " \"" ++ tit ++ "\"" +  return $ text "  " <> char '[' <> label' <> char ']' <> text ": " <> +           text src <> tit'  + +-- | Return markdown representation of notes. +notesToMarkdown :: WriterOptions -> [[Block]] -> State WriterState Doc +notesToMarkdown opts notes =  +  mapM (\(num, note) -> noteToMarkdown opts num note) (zip [1..] notes) >>=  +  return . vcat + +-- | Return markdown representation of a note. +noteToMarkdown :: WriterOptions -> Int -> [Block] -> State WriterState Doc +noteToMarkdown opts num blocks = do +  contents  <- blockListToMarkdown opts blocks +  let marker = text "[^" <> text (show num) <> text "]:" +  return $ hang marker (writerTabStop opts) contents  + +-- | Escape special characters for Markdown. +escapeString :: String -> String +escapeString = escapeStringUsing markdownEscapes +  where markdownEscapes = ('\160', " "):(backslashEscapes "`<\\*_^~") + +-- | Convert bibliographic information into Markdown header. +metaToMarkdown :: WriterOptions -> Meta -> State WriterState Doc +metaToMarkdown opts (Meta title authors date) = do +  title'   <- titleToMarkdown opts title +  authors' <- authorsToMarkdown authors +  date'    <- dateToMarkdown date +  return $ title' $+$ authors' $+$ date' + +titleToMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc +titleToMarkdown opts [] = return empty +titleToMarkdown opts lst = do +  contents <- inlineListToMarkdown opts lst +  return $ text "% " <> contents  + +authorsToMarkdown :: [String] -> State WriterState Doc +authorsToMarkdown [] = return empty +authorsToMarkdown lst = return $  +  text "% " <> text (joinWithSep ", " (map escapeString lst)) + +dateToMarkdown :: String -> State WriterState Doc +dateToMarkdown [] = return empty +dateToMarkdown str = return $ text "% " <> text (escapeString str) + +-- | Construct table of contents from list of header blocks. +tableOfContents :: WriterOptions -> [Block] -> Doc  +tableOfContents opts headers = +  let opts' = opts { writerIgnoreNotes = True } +      contents = BulletList $ map elementToListItem $ hierarchicalize headers +  in  evalState (blockToMarkdown opts' contents) ([],[]) + +-- | Converts an Element to a list item for a table of contents, +elementToListItem :: Element -> [Block] +elementToListItem (Blk _) = [] +elementToListItem (Sec headerText subsecs) = [Plain headerText] ++  +  if null subsecs +     then [] +     else [BulletList $ map elementToListItem subsecs] + +-- | Ordered list start parser for use in Para below. +olMarker :: GenParser Char st Char +olMarker = do (start, style, delim) <- anyOrderedListMarker +              if delim == Period &&  +                          (style == UpperAlpha || (style == UpperRoman && +                          start `elem` [1, 5, 10, 50, 100, 500, 1000])) +                          then spaceChar >> spaceChar +                          else spaceChar + +-- | True if string begins with an ordered list marker +beginsWithOrderedListMarker :: String -> Bool +beginsWithOrderedListMarker str =  +  case parse olMarker "para start" str of +         Left  _  -> False  +         Right _  -> True + +wrappedMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc +wrappedMarkdown opts inlines = do +  let chunks  = splitBy LineBreak inlines +  let chunks' = if null chunks +                   then [] +                   else (map (++ [Str "  "]) $ init chunks) ++ [last chunks] +  lns <- mapM (wrapIfNeeded opts (inlineListToMarkdown opts)) chunks' +  return $ vcat lns + +-- | Convert Pandoc block element to markdown. +blockToMarkdown :: WriterOptions -- ^ Options +                -> Block         -- ^ Block element +                -> State WriterState Doc  +blockToMarkdown opts Null = return empty +blockToMarkdown opts (Plain inlines) =  +  wrappedMarkdown opts inlines +blockToMarkdown opts (Para inlines) = do +  contents <- wrappedMarkdown opts inlines +  -- escape if para starts with ordered list marker +  let esc = if (not (writerStrictMarkdown opts)) &&  +               beginsWithOrderedListMarker (render contents) +               then char '\\' +               else empty  +  return $ esc <> contents <> text "\n" +blockToMarkdown opts (RawHtml str) = return $ text str +blockToMarkdown opts HorizontalRule = return $ text "\n* * * * *\n" +blockToMarkdown opts (Header level inlines) = do +  contents <- inlineListToMarkdown opts inlines +  return $ text ((replicate level '#') ++ " ") <> contents <> text "\n" +blockToMarkdown opts (CodeBlock str) = return $ +  (nest (writerTabStop opts) $ vcat $ map text (lines str)) <> text "\n" +blockToMarkdown opts (BlockQuote blocks) = do +  contents <- blockListToMarkdown opts blocks +  return $ (vcat $ map (text . ("> " ++)) $ lines $ render contents) <>  +           text "\n" +blockToMarkdown opts (Table caption aligns widths headers rows) =  do +  caption' <- inlineListToMarkdown opts caption +  let caption'' = if null caption +                     then empty +                     else text "" $+$ (text "Table: " <> caption') +  headers' <- mapM (blockListToMarkdown opts) headers +  let widthsInChars = map (floor . (78 *)) widths +  let alignHeader alignment = case alignment of +                                AlignLeft    -> leftAlignBlock +                                AlignCenter  -> centerAlignBlock +                                AlignRight   -> rightAlignBlock +                                AlignDefault -> leftAlignBlock   +  let makeRow = hsepBlocks . (zipWith alignHeader aligns) .  +                (zipWith docToBlock widthsInChars) +  let head = makeRow headers' +  rows' <- mapM (\row -> do cols <- mapM (blockListToMarkdown opts) row +                            return $ makeRow cols) rows +  let tableWidth = sum widthsInChars +  let maxRowHeight = maximum $ map heightOfBlock (head:rows') +  let isMultilineTable = maxRowHeight > 1 +  let underline = hsep $  +                  map (\width -> text $ replicate width '-') widthsInChars +  let border = if isMultilineTable +                  then text $ replicate (sum widthsInChars + (length widthsInChars - 1)) '-' +                  else empty +  let spacer = if isMultilineTable +                  then text "" +                  else empty +  let body = vcat $ intersperse spacer $ map blockToDoc rows' +  return $ (nest 2 $ border $+$ (blockToDoc head) $+$ underline $+$ body $+$  +                     border $+$ caption'') <> text "\n" +blockToMarkdown opts (BulletList items) = do +  contents <- mapM (bulletListItemToMarkdown opts) items +  return $ (vcat contents) <> text "\n" +blockToMarkdown opts (OrderedList attribs items) = do +  let markers  = orderedListMarkers attribs +  let markers' = map (\m -> if length m < 3 +                               then m ++ replicate (3 - length m) ' ' +                               else m) markers  +  contents <- mapM (\(item, num) -> orderedListItemToMarkdown opts item num) $ +              zip markers' items   +  return $ (vcat contents) <> text "\n" +blockToMarkdown opts (DefinitionList items) = do +  contents <- mapM (definitionListItemToMarkdown opts) items +  return $ (vcat contents) <> text "\n" + +-- | Convert bullet list item (list of blocks) to markdown. +bulletListItemToMarkdown :: WriterOptions -> [Block] -> State WriterState Doc +bulletListItemToMarkdown opts items = do +  contents <- blockListToMarkdown opts items +  return $ hang (text "-  ") (writerTabStop opts) contents + +-- | Convert ordered list item (a list of blocks) to markdown. +orderedListItemToMarkdown :: WriterOptions -- ^ options +                          -> String        -- ^ list item marker +                          -> [Block]       -- ^ list item (list of blocks) +                          -> State WriterState Doc +orderedListItemToMarkdown opts marker items = do +  contents <- blockListToMarkdown opts items +  -- The complexities here are needed to ensure that if the list +  -- marker is 4 characters or longer, the second and following +  -- lines are indented 4 spaces but the list item begins after the marker. +  return $ sep [nest (min (3 - length marker) 0) (text marker),  +                nest (writerTabStop opts) contents] + +-- | Convert definition list item (label, list of blocks) to markdown. +definitionListItemToMarkdown :: WriterOptions +                             -> ([Inline],[Block])  +                             -> State WriterState Doc +definitionListItemToMarkdown opts (label, items) = do +  labelText <- inlineListToMarkdown opts label +  let tabStop = writerTabStop opts +  let leader  = char ':' +  contents <- mapM (\item -> blockToMarkdown opts item >>=  +                   (\txt -> return (leader $$ nest tabStop txt))) +                   items >>= return . vcat +  return $ labelText $+$ contents + +-- | Convert list of Pandoc block elements to markdown. +blockListToMarkdown :: WriterOptions -- ^ Options +                    -> [Block]       -- ^ List of block elements +                    -> State WriterState Doc  +blockListToMarkdown opts blocks = +  mapM (blockToMarkdown opts) blocks >>= return . vcat + +-- | Get reference for target; if none exists, create unique one and return. +--   Prefer label if possible; otherwise, generate a unique key. +getReference :: [Inline] -> Target -> State WriterState [Inline] +getReference label (src, tit) = do +  (_,refs) <- get +  case find ((== (src, tit)) . snd) refs of +    Just (ref, _) -> return ref +    Nothing       -> do +      let label' = case find ((== label) . fst) refs of +                      Just _ -> -- label is used; generate numerical label +                                 case find (\n -> not (any (== [Str (show n)]) +                                           (map fst refs))) [1..10000] of +                                      Just x  -> [Str (show x)] +                                      Nothing -> error "no unique label" +                      Nothing -> label +      modify (\(notes, refs) -> (notes, (label', (src,tit)):refs)) +      return label' + +-- | Convert list of Pandoc inline elements to markdown. +inlineListToMarkdown :: WriterOptions -> [Inline] -> State WriterState Doc +inlineListToMarkdown opts lst = +  mapM (inlineToMarkdown opts) lst >>= return . hcat + +-- | Convert Pandoc inline element to markdown. +inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc +inlineToMarkdown opts (Emph lst) = do  +  contents <- inlineListToMarkdown opts lst +  return $ char '*' <> contents <> char '*' +inlineToMarkdown opts (Strong lst) = do +  contents <- inlineListToMarkdown opts lst +  return $ text "**" <> contents <> text "**" +inlineToMarkdown opts (Strikeout lst) = do +  contents <- inlineListToMarkdown opts lst +  return $ text "~~" <> contents <> text "~~" +inlineToMarkdown opts (Superscript lst) = do +  contents <- inlineListToMarkdown opts lst +  let contents' = text $ substitute " " "\\ " $ render contents +  return $ char '^' <> contents' <> char '^' +inlineToMarkdown opts (Subscript lst) = do +  contents <- inlineListToMarkdown opts lst +  let contents' = text $ substitute " " "\\ " $ render contents +  return $ char '~' <> contents' <> char '~' +inlineToMarkdown opts (Quoted SingleQuote lst) = do +  contents <- inlineListToMarkdown opts lst +  return $ char '\'' <> contents <> char '\'' +inlineToMarkdown opts (Quoted DoubleQuote lst) = do +  contents <- inlineListToMarkdown opts lst +  return $ char '"' <> contents <> char '"' +inlineToMarkdown opts EmDash = return $ text "--" +inlineToMarkdown opts EnDash = return $ char '-' +inlineToMarkdown opts Apostrophe = return $ char '\'' +inlineToMarkdown opts Ellipses = return $ text "..." +inlineToMarkdown opts (Code str) = +  let tickGroups = filter (\s -> '`' `elem` s) $ group str  +      longest    = if null tickGroups +                     then 0 +                     else maximum $ map length tickGroups  +      marker     = replicate (longest + 1) '`'  +      spacer     = if (longest == 0) then "" else " " in +  return $ text (marker ++ spacer ++ str ++ spacer ++ marker) +inlineToMarkdown opts (Str str) = return $ text $ escapeString str +inlineToMarkdown opts (TeX str) = return $ text str +inlineToMarkdown opts (HtmlInline str) = return $ text str  +inlineToMarkdown opts (LineBreak) = return $ text "  \n" +inlineToMarkdown opts Space = return $ char ' ' +inlineToMarkdown opts (Link txt (src, tit)) = do +  linktext <- inlineListToMarkdown opts txt +  let linktitle = if null tit then empty else text $ " \"" ++ tit ++ "\"" +  let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src +  let useRefLinks = writerReferenceLinks opts +  let useAuto = null tit && txt == [Code srcSuffix] +  ref <- if useRefLinks then getReference txt (src, tit) else return [] +  reftext <- inlineListToMarkdown opts ref +  return $ if useAuto +              then char '<' <> text srcSuffix <> char '>'  +              else if useRefLinks +                      then let first  = char '[' <> linktext <> char ']' +                               second = if txt == ref +                                           then text "[]" +                                           else char '[' <> reftext <> char ']' +                           in  first <> second +                      else char '[' <> linktext <> char ']' <>  +                           char '(' <> text src <> linktitle <> char ')'  +inlineToMarkdown opts (Image alternate (source, tit)) = do +  let txt = if (null alternate) || (alternate == [Str ""]) ||  +               (alternate == [Str source]) -- to prevent autolinks +               then [Str "image"] +               else alternate +  linkPart <- inlineToMarkdown opts (Link txt (source, tit))  +  return $ char '!' <> linkPart +inlineToMarkdown opts (Note contents) = do  +  modify (\(notes, refs) -> (contents:notes, refs)) -- add to notes in state +  (notes, _) <- get +  let ref = show $ (length notes) +  return $ text "[^" <> text ref <> char ']' diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs new file mode 100644 index 000000000..ddcbf95c0 --- /dev/null +++ b/src/Text/Pandoc/Writers/RST.hs @@ -0,0 +1,325 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.RST  +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +Conversion of 'Pandoc' documents to reStructuredText. + +reStructuredText:  <http://docutils.sourceforge.net/rst.html> +-} +module Text.Pandoc.Writers.RST ( writeRST) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared  +import Text.Pandoc.Blocks +import Data.List ( isPrefixOf, isSuffixOf, drop, intersperse ) +import Text.PrettyPrint.HughesPJ hiding ( Str ) +import Control.Monad.State + +type Notes = [[Block]] +type Refs = KeyTable +type WriterState = (Notes, Refs, Refs) -- first Refs is links, second pictures + +-- | Convert Pandoc to RST. +writeRST :: WriterOptions -> Pandoc -> String +writeRST opts document =  +  render $ evalState (pandocToRST opts document) ([],[],[])  + +-- | Return RST representation of document. +pandocToRST :: WriterOptions -> Pandoc -> State WriterState Doc +pandocToRST opts (Pandoc meta blocks) = do +  let before  = writerIncludeBefore opts +  let after   = writerIncludeAfter opts +      before' = if null before then empty else text before +      after'  = if null after then empty else text after +  metaBlock <- metaToRST opts meta +  let head = if (writerStandalone opts) +                then metaBlock $+$ text (writerHeader opts) +                else empty +  body <- blockListToRST opts blocks +  (notes, _, _) <- get +  notes' <- notesToRST opts (reverse notes) +  (_, refs, pics) <- get  -- note that the notes may contain refs +  refs' <- keyTableToRST opts (reverse refs) +  pics' <- pictTableToRST opts (reverse pics) +  return $ head $+$ before' $+$ body $+$ notes' $+$ text "" $+$ refs' $+$  +           pics' $+$ after' + +-- | Return RST representation of reference key table. +keyTableToRST :: WriterOptions -> KeyTable -> State WriterState Doc +keyTableToRST opts refs = mapM (keyToRST opts) refs >>= return . vcat +  +-- | Return RST representation of a reference key.  +keyToRST :: WriterOptions  +         -> ([Inline], (String, String))  +         -> State WriterState Doc +keyToRST opts (label, (src, tit)) = do +  label' <- inlineListToRST opts label +  let label'' = if ':' `elem` (render label') +                   then char '`' <> label' <> char '`' +                   else label' +  return $ text ".. _" <> label'' <> text ": " <> text src + +-- | Return RST representation of notes. +notesToRST :: WriterOptions -> [[Block]] -> State WriterState Doc +notesToRST opts notes =  +  mapM (\(num, note) -> noteToRST opts num note) (zip [1..] notes) >>=  +  return . vcat + +-- | Return RST representation of a note. +noteToRST :: WriterOptions -> Int -> [Block] -> State WriterState Doc +noteToRST opts num note = do +  contents <- blockListToRST opts note +  let marker = text ".. [" <> text (show num) <> text "] " +  return $ hang marker 3 contents  + +-- | Return RST representation of picture reference table. +pictTableToRST :: WriterOptions -> KeyTable -> State WriterState Doc +pictTableToRST opts refs = mapM (pictToRST opts) refs >>= return . vcat +  +-- | Return RST representation of a picture substitution reference.  +pictToRST :: WriterOptions  +         -> ([Inline], (String, String))  +         -> State WriterState Doc +pictToRST opts (label, (src, _)) = do +  label' <- inlineListToRST opts label +  return $ text ".. " <> char '|' <> label' <> char '|' <> text " image:: " <> +           text src + +-- | Take list of inline elements and return wrapped doc. +wrappedRST :: WriterOptions -> [Inline] -> State WriterState Doc +wrappedRST opts inlines = mapM (wrapIfNeeded opts (inlineListToRST opts)) +                          (splitBy LineBreak inlines) >>= return . vcat + +-- | Escape special characters for RST. +escapeString :: String -> String +escapeString = escapeStringUsing (backslashEscapes "`\\|*_") + +-- | Convert bibliographic information into RST header. +metaToRST :: WriterOptions -> Meta -> State WriterState Doc +metaToRST opts (Meta title authors date) = do +  title'   <- titleToRST opts title +  authors' <- authorsToRST authors +  date'    <- dateToRST date +  let toc  =  if writerTableOfContents opts +                 then text "" $+$ text ".. contents::" +                 else empty +  return $ title' $+$ authors' $+$ date' $+$ toc + +titleToRST :: WriterOptions -> [Inline] -> State WriterState Doc +titleToRST opts [] = return empty +titleToRST opts lst = do +  contents <- inlineListToRST opts lst +  let titleLength = length $ render contents +  let border = text (replicate titleLength '=') +  return $ border $+$ contents $+$ border <> text "\n" + +authorsToRST :: [String] -> State WriterState Doc +authorsToRST [] = return empty +authorsToRST (first:rest) = do +  rest' <- authorsToRST rest +  return $ (text ":Author: " <> text first) $+$ rest' + +dateToRST :: String -> State WriterState Doc +dateToRST [] = return empty +dateToRST str = return $ text ":Date: " <> text (escapeString str) + +-- | Convert Pandoc block element to RST.  +blockToRST :: WriterOptions -- ^ Options +                -> Block         -- ^ Block element +                -> State WriterState Doc  +blockToRST opts Null = return empty +blockToRST opts (Plain inlines) = wrappedRST opts inlines +blockToRST opts (Para [TeX str]) = +  let str' = if "\n" `isSuffixOf` str then str ++ "\n" else str ++ "\n\n" in +  return $ hang (text "\n.. raw:: latex\n") 3 $ vcat $ map text (lines str') +blockToRST opts (Para inlines) = do +  contents <- wrappedRST opts inlines +  return $ contents <> text "\n" +blockToRST opts (RawHtml str) =  +  let str' = if "\n" `isSuffixOf` str then str ++ "\n" else str ++ "\n\n" in +  return $ hang (text "\n.. raw:: html\n") 3 $ vcat $ map text (lines str') +blockToRST opts HorizontalRule = return $ text "--------------\n" +blockToRST opts (Header level inlines) = do +  contents <- inlineListToRST opts inlines +  let headerLength = length $ render contents +  let headerChar = if level > 5 then ' ' else "=-~^'" !! (level - 1) +  let border = text $ replicate headerLength headerChar +  return $ contents $+$ border <> text "\n" +blockToRST opts (CodeBlock str) = return $ (text "::\n") $+$  +  (nest (writerTabStop opts) $ vcat $ map text (lines str)) <> text "\n" +blockToRST opts (BlockQuote blocks) = do +  contents <- blockListToRST opts blocks  +  return $ (nest (writerTabStop opts) contents) <> text "\n" +blockToRST opts (Table caption aligns widths headers rows) =  do +  caption' <- inlineListToRST opts caption +  let caption'' = if null caption +                     then empty +                     else text "" $+$ (text "Table: " <> caption') +  headers' <- mapM (blockListToRST opts) headers +  let widthsInChars = map (floor . (78 *)) widths +  let alignHeader alignment = case alignment of +                                AlignLeft    -> leftAlignBlock +                                AlignCenter  -> centerAlignBlock +                                AlignRight   -> rightAlignBlock +                                AlignDefault -> leftAlignBlock   +  let hpipeBlocks blocks = hcatBlocks [beg, middle, end]  +        where height = maximum (map heightOfBlock blocks) +              sep    = TextBlock 3 height (replicate height " | ") +              beg    = TextBlock 2 height (replicate height "| ") +              end    = TextBlock 2 height (replicate height " |") +              middle = hcatBlocks $ intersperse sep blocks +  let makeRow = hpipeBlocks . zipWith docToBlock widthsInChars +  let head = makeRow headers' +  rows' <- mapM (\row -> do cols <- mapM (blockListToRST opts) row +                            return $ makeRow cols) rows +  let tableWidth = sum widthsInChars +  let maxRowHeight = maximum $ map heightOfBlock (head:rows') +  let border ch = char '+' <> char ch <> +                  (hcat $ intersperse (char ch <> char '+' <> char ch) $  +                          map (\l -> text $ replicate l ch) widthsInChars) <> +                  char ch <> char '+' +  let body = vcat $ intersperse (border '-') $ map blockToDoc rows' +  return $ border '-' $+$ blockToDoc head $+$ border '=' $+$ body $+$  +           border '-' $$ caption'' $$ text "" +blockToRST opts (BulletList items) = do +  contents <- mapM (bulletListItemToRST opts) items +  -- ensure that sublists have preceding blank line +  return $ text "" $+$ vcat contents <> text "\n" +blockToRST opts (OrderedList (start, style, delim) items) = do +  let markers = if start == 1 && style == DefaultStyle && delim == DefaultDelim  +                   then take (length items) $ repeat "#." +                   else take (length items) $ orderedListMarkers  +                                              (start, style, delim) +  let maxMarkerLength = maximum $ map length markers +  let markers' = map (\m -> let s = maxMarkerLength - length m +                            in  m ++ replicate s ' ') markers +  contents <- mapM (\(item, num) -> orderedListItemToRST opts item num) $ +              zip markers' items   +  -- ensure that sublists have preceding blank line +  return $ text "" $+$ vcat contents <> text "\n" +blockToRST opts (DefinitionList items) = do +  contents <- mapM (definitionListItemToRST opts) items +  return $ (vcat contents) <> text "\n" + +-- | Convert bullet list item (list of blocks) to RST. +bulletListItemToRST :: WriterOptions -> [Block] -> State WriterState Doc +bulletListItemToRST opts items = do +  contents <- blockListToRST opts items +  return $ hang (text "- ") 3 contents + +-- | Convert ordered list item (a list of blocks) to RST. +orderedListItemToRST :: WriterOptions -- ^ options +                          -> String   -- ^ marker for list item +                          -> [Block]  -- ^ list item (list of blocks) +                          -> State WriterState Doc +orderedListItemToRST opts marker items = do +  contents <- blockListToRST opts items +  return $ hang (text marker) (length marker + 1) contents  + +-- | Convert defintion list item (label, list of blocks) to RST. +definitionListItemToRST :: WriterOptions -> ([Inline], [Block]) -> State WriterState Doc +definitionListItemToRST opts (label, items) = do +  label <- inlineListToRST opts label +  contents <- blockListToRST opts items +  return $ label $+$ nest (writerTabStop opts) contents + +-- | Convert list of Pandoc block elements to RST. +blockListToRST :: WriterOptions -- ^ Options +                    -> [Block]       -- ^ List of block elements +                    -> State WriterState Doc  +blockListToRST opts blocks = +  mapM (blockToRST opts) blocks >>= return . vcat + +-- | Convert list of Pandoc inline elements to RST. +inlineListToRST :: WriterOptions -> [Inline] -> State WriterState Doc +inlineListToRST opts lst = mapM (inlineToRST opts) lst >>= return . hcat + +-- | Convert Pandoc inline element to RST. +inlineToRST :: WriterOptions -> Inline -> State WriterState Doc +inlineToRST opts (Emph lst) = do  +  contents <- inlineListToRST opts lst +  return $ char '*' <> contents <> char '*' +inlineToRST opts (Strong lst) = do +  contents <- inlineListToRST opts lst +  return $ text "**" <> contents <> text "**" +inlineToRST opts (Strikeout lst) = do  +  contents <- inlineListToRST opts lst +  return $ text "[STRIKEOUT:" <> contents <> char ']' +inlineToRST opts (Superscript lst) = do  +  contents <- inlineListToRST opts lst +  return $ text "\\ :sup:`" <> contents <> text "`\\ " +inlineToRST opts (Subscript lst) = do  +  contents <- inlineListToRST opts lst +  return $ text "\\ :sub:`" <> contents <> text "`\\ " +inlineToRST opts (Quoted SingleQuote lst) = do +  contents <- inlineListToRST opts lst +  return $ char '\'' <> contents <> char '\'' +inlineToRST opts (Quoted DoubleQuote lst) = do +  contents <- inlineListToRST opts lst +  return $ char '"' <> contents <> char '"' +inlineToRST opts EmDash = return $ text "--" +inlineToRST opts EnDash = return $ char '-' +inlineToRST opts Apostrophe = return $ char '\'' +inlineToRST opts Ellipses = return $ text "..." +inlineToRST opts (Code str) = return $ text $ "``" ++ str ++ "``" +inlineToRST opts (Str str) = return $ text $ escapeString str +inlineToRST opts (TeX str) = return $ text str +inlineToRST opts (HtmlInline str) = return empty +inlineToRST opts (LineBreak) = return $ char ' ' -- RST doesn't have linebreaks  +inlineToRST opts Space = return $ char ' ' +inlineToRST opts (Link [Code str] (src, tit)) | src == str || +                                                src == "mailto:" ++ str = do +  let srcSuffix = if isPrefixOf "mailto:" src then drop 7 src else src +  return $ text srcSuffix +inlineToRST opts (Link txt (src, tit)) = do +  let useReferenceLinks = writerReferenceLinks opts +  linktext <- inlineListToRST opts $ normalizeSpaces txt +  if useReferenceLinks +    then do (notes, refs, pics) <- get +            let refs' = if (txt, (src, tit)) `elem` refs +                           then refs +                           else (txt, (src, tit)):refs +            put (notes, refs', pics) +            return $ char '`' <> linktext <> text "`_" +    else return $ char '`' <> linktext <> text " <" <> text src <> text ">`_" +inlineToRST opts (Image alternate (source, tit)) = do +  (notes, refs, pics) <- get +  let labelsUsed = map fst pics  +  let txt = if null alternate || alternate == [Str ""] ||  +               alternate `elem` labelsUsed +               then [Str $ "image" ++ show (length refs)] +               else alternate +  let pics' = if (txt, (source, tit)) `elem` pics +                 then pics +                 else (txt, (source, tit)):pics +  put (notes, refs, pics') +  label <- inlineListToRST opts txt +  return $ char '|' <> label <> char '|' +inlineToRST opts (Note contents) = do  +  -- add to notes in state +  modify (\(notes, refs, pics) -> (contents:notes, refs, pics)) +  (notes, _, _) <- get +  let ref = show $ (length notes) +  return $ text " [" <> text ref <> text "]_" diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs new file mode 100644 index 000000000..3bd5c63b2 --- /dev/null +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -0,0 +1,286 @@ +{- +Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Writers.RTF +   Copyright   : Copyright (C) 2006-7 John MacFarlane +   License     : GNU GPL, version 2 or above  + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha  +   Portability : portable + +Conversion of 'Pandoc' documents to RTF (rich text format). +-} +module Text.Pandoc.Writers.RTF ( writeRTF ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Regex ( matchRegexAll, mkRegex ) +import Data.List ( isSuffixOf ) +import Data.Char ( ord ) + +-- | Convert Pandoc to a string in rich text format. +writeRTF :: WriterOptions -> Pandoc -> String +writeRTF options (Pandoc meta blocks) =  +  let head = if writerStandalone options +                then rtfHeader (writerHeader options) meta  +                else "" +      toc  = if writerTableOfContents options +                then tableOfContents $ filter isHeaderBlock blocks +                else ""  +      foot = if writerStandalone options then "\n}\n" else ""  +      body = writerIncludeBefore options ++  +             concatMap (blockToRTF 0 AlignDefault) blocks ++  +             writerIncludeAfter options +  in  head ++ toc ++ body ++ foot + +-- | Construct table of contents from list of header blocks. +tableOfContents :: [Block] -> String  +tableOfContents headers = +  let contentsTree = hierarchicalize headers +  in  concatMap (blockToRTF 0 AlignDefault) $  +      [Header 1 [Str "Contents"],  +       BulletList (map elementToListItem contentsTree)] + +elementToListItem :: Element -> [Block] +elementToListItem (Blk _) = [] +elementToListItem (Sec sectext subsecs) = [Plain sectext] ++ +  if null subsecs +     then [] +     else [BulletList (map elementToListItem subsecs)] + +-- | Convert unicode characters (> 127) into rich text format representation. +handleUnicode :: String -> String +handleUnicode [] = [] +handleUnicode (c:cs) = +  if ord c > 127  +     then '\\':'u':(show (ord c)) ++ "?" ++ handleUnicode cs +     else c:(handleUnicode cs) + +-- | Escape special characters. +escapeSpecial :: String -> String +escapeSpecial = escapeStringUsing (('\t',"\\tab "):(backslashEscapes "{\\}")) + +-- | Escape strings as needed for rich text format. +stringToRTF :: String -> String +stringToRTF = handleUnicode . escapeSpecial + +-- | Escape things as needed for code block in RTF. +codeStringToRTF :: String -> String +codeStringToRTF str = joinWithSep "\\line\n" $ lines (stringToRTF str) + +-- | Deal with raw LaTeX. +latexToRTF :: String -> String +latexToRTF str = "{\\cf1 " ++ (stringToRTF str) ++ "\\cf0 } " + +-- | Make a paragraph with first-line indent, block indent, and space after. +rtfParSpaced :: Int       -- ^ space after (in twips) +             -> Int       -- ^ block indent (in twips) +             -> Int       -- ^ first line indent (relative to block) (in twips) +             -> Alignment -- ^ alignment +             -> String    -- ^ string with content +             -> String  +rtfParSpaced spaceAfter indent firstLineIndent alignment content =  +  let alignString = case alignment of +                           AlignLeft -> "\\ql " +                           AlignRight -> "\\qr " +                           AlignCenter -> "\\qc " +                           AlignDefault -> "\\ql " +  in  "{\\pard " ++ alignString ++ +      "\\f0 \\sa" ++ (show spaceAfter) ++ " \\li" ++ (show indent) ++  +      " \\fi" ++ (show firstLineIndent) ++ " " ++ content ++ "\\par}\n" + +-- | Default paragraph.  +rtfPar :: Int       -- ^ block indent (in twips) +       -> Int       -- ^ first line indent (relative to block) (in twips) +       -> Alignment -- ^ alignment +       -> String    -- ^ string with content +       -> String  +rtfPar = rtfParSpaced 180  + +-- | Compact paragraph (e.g. for compact list items). +rtfCompact ::  Int       -- ^ block indent (in twips) +           ->  Int       -- ^ first line indent (relative to block) (in twips) +           ->  Alignment -- ^ alignment +           ->  String    -- ^ string with content +           ->  String  +rtfCompact = rtfParSpaced 0  + +-- number of twips to indent +indentIncrement = 720 +listIncrement = 360 + +-- | Returns appropriate bullet list marker for indent level. +bulletMarker :: Int -> String +bulletMarker indent = case indent `mod` 720 of +                             0         -> "\\bullet " +                             otherwise -> "\\endash " + +-- | Returns appropriate (list of) ordered list markers for indent level. +orderedMarkers :: Int -> ListAttributes -> [String] +orderedMarkers indent (start, style, delim) =  +  if style == DefaultStyle && delim == DefaultDelim +     then case indent `mod` 720 of +              0         -> orderedListMarkers (start, Decimal, Period) +              otherwise -> orderedListMarkers (start, LowerAlpha, Period) +     else orderedListMarkers (start, style, delim) + +-- | Returns RTF header. +rtfHeader :: String    -- ^ header text +          -> Meta      -- ^ bibliographic information +          -> String +rtfHeader headerText (Meta title authors date) = +  let titletext = if null title +                     then ""  +                     else rtfPar 0 0 AlignCenter $ +                          "\\b \\fs36 " ++ inlineListToRTF title +      authorstext = if null authors +                       then ""  +                       else rtfPar 0 0 AlignCenter (" " ++ (joinWithSep "\\" $ +                                                    map stringToRTF authors)) +      datetext = if date == ""  +                    then "" +                    else rtfPar 0 0 AlignCenter (" " ++ stringToRTF date) in +  let spacer = if null (titletext ++ authorstext ++ datetext) +                  then "" +                  else rtfPar 0 0 AlignDefault "" in +  headerText ++ titletext ++ authorstext ++ datetext ++ spacer + +-- | Convert Pandoc block element to RTF. +blockToRTF :: Int       -- ^ indent level +           -> Alignment -- ^ alignment +           -> Block     -- ^ block to convert +           -> String +blockToRTF _ _ Null = "" +blockToRTF indent alignment (Plain lst) =  +  rtfCompact indent 0 alignment $ inlineListToRTF lst +blockToRTF indent alignment (Para lst) =  +  rtfPar indent 0 alignment $ inlineListToRTF lst +blockToRTF indent alignment (BlockQuote lst) =  +  concatMap (blockToRTF (indent + indentIncrement) alignment) lst  +blockToRTF indent _ (CodeBlock str) = +  rtfPar indent 0 AlignLeft ("\\f1 " ++ (codeStringToRTF str)) +blockToRTF _ _ (RawHtml str) = "" +blockToRTF indent alignment (BulletList lst) = spaceAtEnd $  +  concatMap (listItemToRTF alignment indent (bulletMarker indent)) lst +blockToRTF indent alignment (OrderedList attribs lst) = spaceAtEnd $ concat $  +  zipWith (listItemToRTF alignment indent) (orderedMarkers indent attribs) lst +blockToRTF indent alignment (DefinitionList lst) = spaceAtEnd $  +  concatMap (definitionListItemToRTF alignment indent) lst +blockToRTF indent _ HorizontalRule =  +  rtfPar indent 0 AlignCenter "\\emdash\\emdash\\emdash\\emdash\\emdash" +blockToRTF indent alignment (Header level lst) = rtfPar indent 0 alignment $ +  "\\b \\fs" ++ (show (40 - (level * 4))) ++ " " ++ inlineListToRTF lst +blockToRTF indent alignment (Table caption aligns sizes headers rows) =  +  tableRowToRTF True indent aligns sizes headers ++  +  concatMap (tableRowToRTF False indent aligns sizes) rows ++ +  rtfPar indent 0 alignment (inlineListToRTF caption) + +tableRowToRTF :: Bool -> Int -> [Alignment] -> [Float] -> [[Block]] -> String  +tableRowToRTF header indent aligns sizes cols = +  let columns = concat $ zipWith (tableItemToRTF indent) aligns cols +      totalTwips = 6 * 1440 -- 6 inches +      rightEdges = tail $ scanl (\sofar new -> sofar + floor (new * totalTwips)) +                                0 sizes +      cellDefs = map (\edge -> (if header +                                   then "\\clbrdrb\\brdrs" +                                   else "") ++ "\\cellx" ++ show edge) +                     rightEdges +      start = "{\n\\trowd \\trgaph120\n" ++ concat cellDefs ++ "\n" ++ +              "\\trkeep\\intbl\n{\n" +      end = "}\n\\intbl\\row}\n" +  in  start ++ columns ++ end + +tableItemToRTF :: Int -> Alignment -> [Block] -> String  +tableItemToRTF indent alignment item = +  let contents = concatMap (blockToRTF indent alignment) item +  in  "{\\intbl " ++ contents ++ "\\cell}\n" + +-- | Ensure that there's the same amount of space after compact +-- lists as after regular lists. +spaceAtEnd :: String -> String +spaceAtEnd str =  +  if isSuffixOf "\\par}\n" str +     then (take ((length str) - 6) str) ++ "\\sa180\\par}\n" +     else str + +-- | Convert list item (list of blocks) to RTF. +listItemToRTF :: Alignment  -- ^ alignment +              -> Int        -- ^ indent level +              -> String     -- ^ list start marker +              -> [Block]    -- ^ list item (list of blocks) +              -> [Char] +listItemToRTF alignment indent marker [] =  +  rtfCompact (indent + listIncrement) (0 - listIncrement) alignment  +             (marker ++ "\\tx" ++ (show listIncrement) ++ "\\tab ")  +listItemToRTF alignment indent marker list =  +  let (first:rest) = map (blockToRTF (indent + listIncrement) alignment) list in +  -- insert the list marker into the (processed) first block +  let modFirst = case matchRegexAll (mkRegex "\\\\fi-?[0-9]+") first of +                    Just (before, matched, after, _) ->  +                                before ++ "\\fi" ++ show (0 - listIncrement) ++  +                                " " ++ marker ++ "\\tx" ++  +                                show listIncrement ++ "\\tab" ++ after +                    Nothing -> first in +  modFirst ++ concat rest + +-- | Convert definition list item (label, list of blocks) to RTF. +definitionListItemToRTF :: Alignment          -- ^ alignment +                        -> Int                -- ^ indent level +                        -> ([Inline],[Block]) -- ^ list item (list of blocks) +                        -> [Char] +definitionListItemToRTF alignment indent (label, items) = +  let labelText = blockToRTF indent alignment (Plain label) +      itemsText = concatMap (blockToRTF (indent + listIncrement) alignment) items +  in  labelText ++ itemsText  + +-- | Convert list of inline items to RTF. +inlineListToRTF :: [Inline]   -- ^ list of inlines to convert +                -> String +inlineListToRTF lst = concatMap inlineToRTF lst + +-- | Convert inline item to RTF. +inlineToRTF :: Inline         -- ^ inline to convert +            -> String +inlineToRTF (Emph lst) = "{\\i " ++ (inlineListToRTF lst) ++ "} " +inlineToRTF (Strong lst) = "{\\b " ++ (inlineListToRTF lst) ++ "} " +inlineToRTF (Strikeout lst) = "{\\strike " ++ (inlineListToRTF lst) ++ "} " +inlineToRTF (Superscript lst) = "{\\super " ++ (inlineListToRTF lst) ++ "} " +inlineToRTF (Subscript lst) = "{\\sub " ++ (inlineListToRTF lst) ++ "} " +inlineToRTF (Quoted SingleQuote lst) =  +  "\\u8216'" ++ (inlineListToRTF lst) ++ "\\u8217'" +inlineToRTF (Quoted DoubleQuote lst) =  +  "\\u8220\"" ++ (inlineListToRTF lst) ++ "\\u8221\"" +inlineToRTF Apostrophe = "\\u8217'" +inlineToRTF Ellipses = "\\u8230?" +inlineToRTF EmDash = "\\u8212-" +inlineToRTF EnDash = "\\u8211-" +inlineToRTF (Code str) = "{\\f1 " ++ (codeStringToRTF str) ++ "} " +inlineToRTF (Str str) = stringToRTF str +inlineToRTF (TeX str) = latexToRTF str +inlineToRTF (HtmlInline str) = "" +inlineToRTF (LineBreak) = "\\line " +inlineToRTF Space = " " +inlineToRTF (Link text (src, tit)) =  +  "{\\field{\\*\\fldinst{HYPERLINK \"" ++ (codeStringToRTF src) ++  +  "\"}}{\\fldrslt{\\ul\n" ++ (inlineListToRTF text) ++ "\n}}}\n" +inlineToRTF (Image alternate (source, tit)) =  +  "{\\cf1 [image: " ++ source ++ "]\\cf0}"  +inlineToRTF (Note contents) = +  "{\\super\\chftn}{\\*\\footnote\\chftn\\~\\plain\\pard " ++  +  (concatMap (blockToRTF 0 AlignDefault) contents) ++ "}"  | 
