diff options
Diffstat (limited to 'src/Text/Pandoc')
96 files changed, 44676 insertions, 0 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs new file mode 100644 index 000000000..be8f26811 --- /dev/null +++ b/src/Text/Pandoc/App.hs @@ -0,0 +1,1444 @@ +{-# LANGUAGE CPP, TupleSections, ScopedTypeVariables, PatternGuards #-} +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.App + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley@edu> + Stability : alpha + Portability : portable + +Does a pandoc conversion based on command-line options. +-} +module Text.Pandoc.App ( + convertWithOpts + , Opt(..) + , defaultOpts + , parseOptions + , options + ) where +import Text.Pandoc +import Text.Pandoc.Builder (setMeta) +import Text.Pandoc.PDF (makePDF) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.Shared ( tabFilter, readDataFileUTF8, + headerShift, err, openURL, safeRead, + readDataFile ) +import Text.Pandoc.MediaBag ( mediaDirectory, extractMediaBag, MediaBag ) +import Text.Pandoc.XML ( toEntities ) +import Text.Pandoc.Highlighting (highlightingStyles) +import Text.Pandoc.SelfContained ( makeSelfContained ) +import Text.Pandoc.Process (pipeProcess) +import Skylighting ( Style, defaultSyntaxMap, Syntax(..) ) +import Text.Printf +import System.Environment ( getEnvironment, getProgName, getArgs ) +import Control.Applicative ((<|>)) +import System.Exit ( ExitCode (..), exitSuccess ) +import System.FilePath +import Data.Char ( toLower, toUpper ) +import Data.List ( intercalate, isPrefixOf, isSuffixOf, sort ) +import System.Directory ( getAppUserDataDirectory, findExecutable, + doesFileExist, Permissions(..), getPermissions ) +import System.IO ( stdout, stderr ) +import System.IO.Error ( isDoesNotExistError ) +import qualified Control.Exception as E +import Control.Exception.Extensible ( throwIO ) +import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad +import Control.Monad.Trans +import Data.Maybe (fromMaybe, isNothing, isJust) +import Data.Foldable (foldrM) +import Network.URI (parseURI, isURI, URI(..)) +import qualified Data.ByteString.Lazy as B +import qualified Data.ByteString as BS +import qualified Data.Map as M +import Data.Aeson (eitherDecode', encode) +import Data.Yaml (decode) +import qualified Data.Yaml as Yaml +import qualified Data.Text as T +import System.Console.GetOpt +import Text.Pandoc.Class (withMediaBag, PandocIO, getLog) +import Paths_pandoc (getDataDir) +#ifndef _WINDOWS +import System.Posix.Terminal (queryTerminal) +import System.Posix.IO (stdOutput) +#endif + +parseOptions :: [OptDescr (Opt -> IO Opt)] -> Opt -> IO Opt +parseOptions options' defaults = do + rawArgs <- map UTF8.decodeArg <$> getArgs + prg <- getProgName + + let (actions, args, unrecognizedOpts, errors) = + getOpt' Permute options' rawArgs + + let unknownOptionErrors = foldr handleUnrecognizedOption [] unrecognizedOpts + + unless (null errors && null unknownOptionErrors) $ + err 2 $ concat errors ++ unlines unknownOptionErrors ++ + ("Try " ++ prg ++ " --help for more information.") + + -- thread option data structure through all supplied option actions + opts <- foldl (>>=) (return defaults) actions + return (opts{ optInputFiles = args }) + +convertWithOpts :: Opt -> IO () +convertWithOpts opts = do + let args = optInputFiles opts + let outputFile = optOutputFile opts + let filters = optFilters opts + let verbosity = optVerbosity opts + + when (optDumpArgs opts) $ + do UTF8.hPutStrLn stdout outputFile + mapM_ (UTF8.hPutStrLn stdout) args + exitSuccess + + epubStylesheet <- case optEpubStylesheet opts of + Nothing -> return Nothing + Just fp -> Just <$> UTF8.readFile fp + + epubMetadata <- case optEpubMetadata opts of + Nothing -> return Nothing + Just fp -> Just <$> UTF8.readFile fp + + let csscdn = "https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.6.0/katex.min.css" + let mathMethod = + case (optKaTeXJS opts, optKaTeXStylesheet opts) of + (Nothing, _) -> optHTMLMathMethod opts + (Just js, ss) -> KaTeX js (fromMaybe csscdn ss) + + + -- --bibliography implies -F pandoc-citeproc for backwards compatibility: + let needsCiteproc = isJust (lookup "bibliography" (optMetadata opts)) && + optCiteMethod opts `notElem` [Natbib, Biblatex] && + "pandoc-citeproc" `notElem` map takeBaseName filters + let filters' = if needsCiteproc then "pandoc-citeproc" : filters + else filters + + let sources = case args of + [] -> ["-"] + xs | optIgnoreArgs opts -> ["-"] + | otherwise -> xs + + datadir <- case optDataDir opts of + Nothing -> E.catch + (Just <$> getAppUserDataDirectory "pandoc") + (\e -> let _ = (e :: E.SomeException) + in return Nothing) + Just _ -> return $ optDataDir opts + + -- assign reader and writer based on options and filenames + let readerName = case optReader opts of + Nothing -> defaultReaderName + (if any isURI sources + then "html" + else "markdown") sources + Just x -> map toLower x + + let writerName = case optWriter opts of + Nothing -> defaultWriterName outputFile + Just x -> map toLower x + let format = takeWhile (`notElem` ['+','-']) + $ takeFileName writerName -- in case path to lua script + + let pdfOutput = map toLower (takeExtension outputFile) == ".pdf" + + let laTeXOutput = format `elem` ["latex", "beamer"] + let conTeXtOutput = format == "context" + let html5Output = format == "html5" || format == "html" + + -- disabling the custom writer for now + writer <- if ".lua" `isSuffixOf` format + -- note: use non-lowercased version writerName + then error "custom writers disabled for now" + else case getWriter writerName of + Left e -> err 9 $ + if format == "pdf" + then e ++ + "\nTo create a pdf with pandoc, use " ++ + "the latex or beamer writer and specify\n" ++ + "an output file with .pdf extension " ++ + "(pandoc -t latex -o filename.pdf)." + else e + Right w -> return (w :: Writer PandocIO) + + -- TODO: we have to get the input and the output into the state for + -- the sake of the text2tags reader. + reader <- case getReader readerName of + Right r -> return (r :: Reader PandocIO) + Left e -> err 7 e' + where e' = case readerName of + "pdf" -> e ++ + "\nPandoc can convert to PDF, but not from PDF." + "doc" -> e ++ + "\nPandoc can convert from DOCX, but not from DOC.\nTry using Word to save your DOC file as DOCX, and convert that with pandoc." + _ -> e + + let standalone = optStandalone opts || not (isTextFormat format) || pdfOutput + + templ <- case optTemplate opts of + _ | not standalone -> return Nothing + Nothing -> do + deftemp <- getDefaultTemplate datadir format + case deftemp of + Left e -> throwIO e + Right t -> return (Just t) + Just tp -> do + -- strip off extensions + let tp' = case takeExtension tp of + "" -> tp <.> format + _ -> tp + Just <$> E.catch (UTF8.readFile tp') + (\e -> if isDoesNotExistError e + then E.catch + (readDataFileUTF8 datadir + ("templates" </> tp')) + (\e' -> let _ = (e' :: E.SomeException) + in throwIO e') + else throwIO e) + + let addStringAsVariable varname s vars = return $ (varname, s) : vars + + let addContentsAsVariable varname fp vars = do + s <- UTF8.readFile fp + return $ (varname, s) : vars + + -- note: this reverses the list constructed in option parsing, + -- which in turn was reversed from the command-line order, + -- so we end up with the correct order in the variable list: + let withList _ [] vars = return vars + withList f (x:xs) vars = f x vars >>= withList f xs + + variables <- return (optVariables opts) + >>= + withList (addContentsAsVariable "include-before") + (optIncludeBeforeBody opts) + >>= + withList (addContentsAsVariable "include-after") + (optIncludeAfterBody opts) + >>= + withList (addContentsAsVariable "header-includes") + (optIncludeInHeader opts) + >>= + withList (addStringAsVariable "css") (optCss opts) + >>= + maybe return (addStringAsVariable "title-prefix") (optTitlePrefix opts) + >>= + maybe return (addStringAsVariable "epub-cover-image") + (optEpubCoverImage opts) + >>= + (\vars -> case mathMethod of + LaTeXMathML Nothing -> do + s <- readDataFileUTF8 datadir "LaTeXMathML.js" + return $ ("mathml-script", s) : vars + _ -> return vars) + >>= + (\vars -> if format == "dzslides" + then do + dztempl <- readDataFileUTF8 datadir + ("dzslides" </> "template.html") + let dzline = "<!-- {{{{ dzslides core" + let dzcore = unlines + $ dropWhile (not . (dzline `isPrefixOf`)) + $ lines dztempl + return $ ("dzslides-core", dzcore) : vars + else return vars) + + let sourceURL = case sources of + [] -> Nothing + (x:_) -> case parseURI x of + Just u + | uriScheme u `elem` ["http:","https:"] -> + Just $ show u{ uriQuery = "", + uriFragment = "" } + _ -> Nothing + + let readerOpts = def{ readerStandalone = standalone + , readerColumns = optColumns opts + , readerTabStop = optTabStop opts + , readerIndentedCodeClasses = optIndentedCodeClasses opts + , readerApplyMacros = not laTeXOutput + , readerDefaultImageExtension = + optDefaultImageExtension opts + , readerTrackChanges = optTrackChanges opts + } + + highlightStyle <- lookupHighlightStyle $ optHighlightStyle opts + + let writerOptions = def { writerTemplate = templ, + writerVariables = variables, + writerTabStop = optTabStop opts, + writerTableOfContents = optTableOfContents opts, + writerHTMLMathMethod = mathMethod, + writerIncremental = optIncremental opts, + writerCiteMethod = optCiteMethod opts, + writerNumberSections = optNumberSections opts, + writerNumberOffset = optNumberOffset opts, + writerSectionDivs = optSectionDivs opts, + writerReferenceLinks = optReferenceLinks opts, + writerReferenceLocation = optReferenceLocation opts, + writerDpi = optDpi opts, + writerWrapText = optWrapText opts, + writerColumns = optColumns opts, + writerEmailObfuscation = optEmailObfuscation opts, + writerIdentifierPrefix = optIdentifierPrefix opts, + writerSourceURL = sourceURL, + writerUserDataDir = datadir, + writerHtmlQTags = optHtmlQTags opts, + writerTopLevelDivision = optTopLevelDivision opts, + writerListings = optListings opts, + writerSlideLevel = optSlideLevel opts, + writerHighlightStyle = highlightStyle, + writerSetextHeaders = optSetextHeaders opts, + writerEpubMetadata = epubMetadata, + writerEpubStylesheet = epubStylesheet, + writerEpubFonts = optEpubFonts opts, + writerEpubChapterLevel = optEpubChapterLevel opts, + writerTOCDepth = optTOCDepth opts, + writerReferenceDoc = optReferenceDoc opts, + writerLaTeXArgs = optLaTeXEngineArgs opts + } + + +#ifdef _WINDOWS + let istty = True +#else + istty <- queryTerminal stdOutput +#endif + when (istty && not (isTextFormat format) && outputFile == "-") $ + err 5 $ "Cannot write " ++ format ++ " output to stdout.\n" ++ + "Specify an output file using the -o option." + + + let transforms = case optBaseHeaderLevel opts of + x | x > 1 -> [headerShift (x - 1)] + | otherwise -> [] + + let convertTabs = tabFilter (if optPreserveTabs opts || readerName == "t2t" + then 0 + else optTabStop opts) + + readSources :: (Functor m, MonadIO m) => [FilePath] -> m String + readSources srcs = convertTabs . intercalate "\n" <$> + mapM readSource srcs + + let runIO' :: PandocIO a -> IO a + runIO' f = do + (res, reports) <- runIOorExplode $ do + setVerbosity verbosity + x <- f + rs <- getLog + return (x, rs) + case optLogFile opts of + Nothing -> return () + Just logfile -> B.writeFile logfile (encodeLogMessages reports) + let isWarning msg = messageVerbosity msg == WARNING + when (optFailIfWarnings opts && any isWarning reports) $ + err 3 "Failing because there were warnings." + return res + + let sourceToDoc :: [FilePath] -> PandocIO (Pandoc, MediaBag) + sourceToDoc sources' = + case reader of + StringReader r + | optFileScope opts || readerName == "json" -> do + pairs <- mapM + (readSource >=> withMediaBag . r readerOpts) sources + return (mconcat (map fst pairs), mconcat (map snd pairs)) + | otherwise -> + readSources sources' >>= withMediaBag . r readerOpts + ByteStringReader r -> do + pairs <- mapM (readFile' >=> + withMediaBag . r readerOpts) sources + return (mconcat (map fst pairs), mconcat (map snd pairs)) + + runIO' $ do + (doc, media) <- sourceToDoc sources + doc' <- (maybe return (extractMedia media) (optExtractMedia opts) >=> + return . flip (foldr addMetadata) (optMetadata opts) >=> + applyTransforms transforms >=> + applyFilters datadir filters' [format]) doc + + case writer of + -- StringWriter f -> f writerOptions doc' >>= writerFn outputFile + ByteStringWriter f -> f writerOptions doc' >>= writeFnBinary outputFile + StringWriter f + | pdfOutput -> do + -- make sure writer is latex or beamer or context or html5 + unless (laTeXOutput || conTeXtOutput || html5Output) $ + err 47 $ "cannot produce pdf output with " ++ format ++ + " writer" + + let pdfprog = case () of + _ | conTeXtOutput -> "context" + _ | html5Output -> "wkhtmltopdf" + _ -> optLaTeXEngine opts + -- check for pdf creating program + mbPdfProg <- liftIO $ findExecutable pdfprog + when (isNothing mbPdfProg) $ + err 41 $ pdfprog ++ " not found. " ++ + pdfprog ++ " is needed for pdf output." + + res <- makePDF pdfprog f writerOptions verbosity media doc' + case res of + Right pdf -> writeFnBinary outputFile pdf + Left err' -> liftIO $ do + B.hPutStr stderr err' + B.hPut stderr $ B.pack [10] + err 43 "Error producing PDF" + | otherwise -> do + let htmlFormat = format `elem` + ["html","html4","html5","s5","slidy","slideous","dzslides","revealjs"] + selfcontain = if optSelfContained opts && htmlFormat + then makeSelfContained writerOptions media + else return + handleEntities = if htmlFormat && optAscii opts + then toEntities + else id + output <- f writerOptions doc' + selfcontain (output ++ ['\n' | not standalone]) >>= + writerFn outputFile . handleEntities + +type Transform = Pandoc -> Pandoc + +isTextFormat :: String -> Bool +isTextFormat s = s `notElem` ["odt","docx","epub","epub3"] + +externalFilter :: MonadIO m => FilePath -> [String] -> Pandoc -> m Pandoc +externalFilter f args' d = liftIO $ do + exists <- doesFileExist f + isExecutable <- if exists + then executable <$> getPermissions f + else return True + let (f', args'') = if exists + then case map toLower (takeExtension f) of + _ | isExecutable -> ("." </> f, args') + ".py" -> ("python", f:args') + ".hs" -> ("runhaskell", f:args') + ".pl" -> ("perl", f:args') + ".rb" -> ("ruby", f:args') + ".php" -> ("php", f:args') + ".js" -> ("node", f:args') + _ -> (f, args') + else (f, args') + unless (exists && isExecutable) $ do + mbExe <- findExecutable f' + when (isNothing mbExe) $ + err 83 $ "Error running filter " ++ f ++ ":\n" ++ + "Could not find executable '" ++ f' ++ "'." + env <- getEnvironment + let env' = Just $ ("PANDOC_VERSION", pandocVersion) : env + (exitcode, outbs) <- E.handle filterException $ + pipeProcess env' f' args'' $ encode d + case exitcode of + ExitSuccess -> return $ either error id $ eitherDecode' outbs + ExitFailure ec -> err 83 $ "Error running filter " ++ f ++ "\n" ++ + "Filter returned error status " ++ show ec + where filterException :: E.SomeException -> IO a + filterException e = err 83 $ "Error running filter " ++ f ++ "\n" ++ + show e + +-- | Data structure for command line options. +data Opt = Opt + { optTabStop :: Int -- ^ Number of spaces per tab + , optPreserveTabs :: Bool -- ^ Preserve tabs instead of converting to spaces + , optStandalone :: Bool -- ^ Include header, footer + , optReader :: Maybe String -- ^ Reader format + , optWriter :: Maybe String -- ^ Writer format + , optTableOfContents :: Bool -- ^ Include table of contents + , optBaseHeaderLevel :: Int -- ^ Base header level + , optTemplate :: Maybe FilePath -- ^ Custom template + , optVariables :: [(String,String)] -- ^ Template variables to set + , optMetadata :: [(String, String)] -- ^ Metadata fields to set + , optOutputFile :: FilePath -- ^ Name of output file + , optInputFiles :: [FilePath] -- ^ Names of input files + , optNumberSections :: Bool -- ^ Number sections in LaTeX + , optNumberOffset :: [Int] -- ^ Starting number for sections + , optSectionDivs :: Bool -- ^ Put sections in div tags in HTML + , optIncremental :: Bool -- ^ Use incremental lists in Slidy/Slideous/S5 + , optSelfContained :: Bool -- ^ Make HTML accessible offline + , optHtmlQTags :: Bool -- ^ Use <q> tags in HTML + , optHighlightStyle :: Maybe String -- ^ Style to use for highlighted code + , optTopLevelDivision :: TopLevelDivision -- ^ Type of the top-level divisions + , optHTMLMathMethod :: HTMLMathMethod -- ^ Method to print HTML math + , optReferenceDoc :: Maybe FilePath -- ^ Path of reference doc + , optEpubStylesheet :: Maybe FilePath -- ^ EPUB stylesheet + , optEpubMetadata :: Maybe FilePath -- ^ EPUB metadata + , optEpubFonts :: [FilePath] -- ^ EPUB fonts to embed + , optEpubChapterLevel :: Int -- ^ Header level at which to split chapters + , optEpubCoverImage :: Maybe FilePath -- ^ Cover image for epub + , optTOCDepth :: Int -- ^ Number of levels to include in TOC + , optDumpArgs :: Bool -- ^ Output command-line arguments + , optIgnoreArgs :: Bool -- ^ Ignore command-line arguments + , optVerbosity :: Verbosity -- ^ Verbosity of diagnostic output + , optLogFile :: Maybe FilePath -- ^ File to write JSON log output + , optFailIfWarnings :: Bool -- ^ Fail on warnings + , optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst + , optReferenceLocation :: ReferenceLocation -- ^ location for footnotes and link references in markdown output + , optDpi :: Int -- ^ Dpi + , optWrapText :: WrapOption -- ^ Options for wrapping text + , optColumns :: Int -- ^ Line length in characters + , optFilters :: [FilePath] -- ^ Filters to apply + , optEmailObfuscation :: ObfuscationMethod + , optIdentifierPrefix :: String + , optIndentedCodeClasses :: [String] -- ^ Default classes for indented code blocks + , optDataDir :: Maybe FilePath + , optCiteMethod :: CiteMethod -- ^ Method to output cites + , optListings :: Bool -- ^ Use listings package for code blocks + , optLaTeXEngine :: String -- ^ Program to use for latex -> pdf + , optLaTeXEngineArgs :: [String] -- ^ Flags to pass to the latex-engine + , optSlideLevel :: Maybe Int -- ^ Header level that creates slides + , optSetextHeaders :: Bool -- ^ Use atx headers for markdown level 1-2 + , optAscii :: Bool -- ^ Use ascii characters only in html + , optDefaultImageExtension :: String -- ^ Default image extension + , optExtractMedia :: Maybe FilePath -- ^ Path to extract embedded media + , optTrackChanges :: TrackChanges -- ^ Accept or reject MS Word track-changes. + , optFileScope :: Bool -- ^ Parse input files before combining + , optKaTeXStylesheet :: Maybe String -- ^ Path to stylesheet for KaTeX + , optKaTeXJS :: Maybe String -- ^ Path to js file for KaTeX + , optTitlePrefix :: Maybe String -- ^ Prefix for title + , optCss :: [FilePath] -- ^ CSS files to link to + , optIncludeBeforeBody :: [FilePath] -- ^ Files to include before + , optIncludeAfterBody :: [FilePath] -- ^ Files to include after body + , optIncludeInHeader :: [FilePath] -- ^ Files to include in header + } + +-- | Defaults for command-line options. +defaultOpts :: Opt +defaultOpts = Opt + { optTabStop = 4 + , optPreserveTabs = False + , optStandalone = False + , optReader = Nothing + , optWriter = Nothing + , optTableOfContents = False + , optBaseHeaderLevel = 1 + , optTemplate = Nothing + , optVariables = [] + , optMetadata = [] + , optOutputFile = "-" -- "-" means stdout + , optInputFiles = [] + , optNumberSections = False + , optNumberOffset = [0,0,0,0,0,0] + , optSectionDivs = False + , optIncremental = False + , optSelfContained = False + , optHtmlQTags = False + , optHighlightStyle = Just "pygments" + , optTopLevelDivision = TopLevelDefault + , optHTMLMathMethod = PlainMath + , optReferenceDoc = Nothing + , optEpubStylesheet = Nothing + , optEpubMetadata = Nothing + , optEpubFonts = [] + , optEpubChapterLevel = 1 + , optEpubCoverImage = Nothing + , optTOCDepth = 3 + , optDumpArgs = False + , optIgnoreArgs = False + , optVerbosity = WARNING + , optLogFile = Nothing + , optFailIfWarnings = False + , optReferenceLinks = False + , optReferenceLocation = EndOfDocument + , optDpi = 96 + , optWrapText = WrapAuto + , optColumns = 72 + , optFilters = [] + , optEmailObfuscation = NoObfuscation + , optIdentifierPrefix = "" + , optIndentedCodeClasses = [] + , optDataDir = Nothing + , optCiteMethod = Citeproc + , optListings = False + , optLaTeXEngine = "pdflatex" + , optLaTeXEngineArgs = [] + , optSlideLevel = Nothing + , optSetextHeaders = True + , optAscii = False + , optDefaultImageExtension = "" + , optExtractMedia = Nothing + , optTrackChanges = AcceptChanges + , optFileScope = False + , optKaTeXStylesheet = Nothing + , optKaTeXJS = Nothing + , optTitlePrefix = Nothing + , optCss = [] + , optIncludeBeforeBody = [] + , optIncludeAfterBody = [] + , optIncludeInHeader = [] + } + +addMetadata :: (String, String) -> Pandoc -> Pandoc +addMetadata (k, v) (Pandoc meta bs) = Pandoc meta' bs + where meta' = case lookupMeta k meta of + Nothing -> setMeta k v' meta + Just (MetaList xs) -> + setMeta k (MetaList (xs ++ [v'])) meta + Just x -> setMeta k (MetaList [x, v']) meta + v' = readMetaValue v + +readMetaValue :: String -> MetaValue +readMetaValue s = case decode (UTF8.fromString s) of + Just (Yaml.String t) -> MetaString $ T.unpack t + Just (Yaml.Bool b) -> MetaBool b + _ -> MetaString s + +-- Determine default reader based on source file extensions +defaultReaderName :: String -> [FilePath] -> String +defaultReaderName fallback [] = fallback +defaultReaderName fallback (x:xs) = + case takeExtension (map toLower x) of + ".xhtml" -> "html" + ".html" -> "html" + ".htm" -> "html" + ".md" -> "markdown" + ".markdown" -> "markdown" + ".tex" -> "latex" + ".latex" -> "latex" + ".ltx" -> "latex" + ".rst" -> "rst" + ".org" -> "org" + ".lhs" -> "markdown+lhs" + ".db" -> "docbook" + ".opml" -> "opml" + ".wiki" -> "mediawiki" + ".dokuwiki" -> "dokuwiki" + ".textile" -> "textile" + ".native" -> "native" + ".json" -> "json" + ".docx" -> "docx" + ".t2t" -> "t2t" + ".epub" -> "epub" + ".odt" -> "odt" + ".pdf" -> "pdf" -- so we get an "unknown reader" error + ".doc" -> "doc" -- so we get an "unknown reader" error + _ -> defaultReaderName fallback xs + +-- Determine default writer based on output file extension +defaultWriterName :: FilePath -> String +defaultWriterName "-" = "html" -- no output file +defaultWriterName x = + case takeExtension (map toLower x) of + "" -> "markdown" -- empty extension + ".tex" -> "latex" + ".latex" -> "latex" + ".ltx" -> "latex" + ".context" -> "context" + ".ctx" -> "context" + ".rtf" -> "rtf" + ".rst" -> "rst" + ".s5" -> "s5" + ".native" -> "native" + ".json" -> "json" + ".txt" -> "markdown" + ".text" -> "markdown" + ".md" -> "markdown" + ".markdown" -> "markdown" + ".textile" -> "textile" + ".lhs" -> "markdown+lhs" + ".texi" -> "texinfo" + ".texinfo" -> "texinfo" + ".db" -> "docbook" + ".odt" -> "odt" + ".docx" -> "docx" + ".epub" -> "epub" + ".org" -> "org" + ".asciidoc" -> "asciidoc" + ".adoc" -> "asciidoc" + ".pdf" -> "latex" + ".fb2" -> "fb2" + ".opml" -> "opml" + ".icml" -> "icml" + ".tei.xml" -> "tei" + ".tei" -> "tei" + ['.',y] | y `elem` ['1'..'9'] -> "man" + _ -> "html" + +-- Transformations of a Pandoc document post-parsing: + +extractMedia :: MonadIO m => MediaBag -> FilePath -> Pandoc -> m Pandoc +extractMedia media dir d = + case [fp | (fp, _, _) <- mediaDirectory media] of + [] -> return d + fps -> do + extractMediaBag True dir media + return $ walk (adjustImagePath dir fps) d + +adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline +adjustImagePath dir paths (Image attr lab (src, tit)) + | src `elem` paths = Image attr lab (dir ++ "/" ++ src, tit) +adjustImagePath _ _ x = x + +applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc +applyTransforms transforms d = return $ foldr ($) d transforms + + -- First we check to see if a filter is found. If not, and if it's + -- not an absolute path, we check to see whether it's in `userdir/filters`. + -- If not, we leave it unchanged. +expandFilterPath :: MonadIO m => Maybe FilePath -> FilePath -> m FilePath +expandFilterPath mbDatadir fp = liftIO $ do + fpExists <- doesFileExist fp + if fpExists + then return fp + else case mbDatadir of + Just datadir | isRelative fp -> do + let filterPath = (datadir </> "filters" </> fp) + filterPathExists <- doesFileExist filterPath + if filterPathExists + then return filterPath + else return fp + _ -> return fp + +applyFilters :: MonadIO m + => Maybe FilePath -> [FilePath] -> [String] -> Pandoc -> m Pandoc +applyFilters mbDatadir filters args d = do + expandedFilters <- mapM (expandFilterPath mbDatadir) filters + foldrM ($) d $ map (flip externalFilter args) expandedFilters + +readSource :: MonadIO m => FilePath -> m String +readSource "-" = liftIO UTF8.getContents +readSource src = case parseURI src of + Just u | uriScheme u `elem` ["http:","https:"] -> + readURI src + | uriScheme u == "file:" -> + liftIO $ UTF8.readFile (uriPath u) + _ -> liftIO $ UTF8.readFile src + +readURI :: MonadIO m => FilePath -> m String +readURI src = do + res <- liftIO $ openURL src + case res of + Left e -> liftIO $ throwIO e + Right (bs,_) -> return $ UTF8.toString bs + +readFile' :: MonadIO m => FilePath -> m B.ByteString +readFile' "-" = liftIO $ B.getContents +readFile' f = liftIO $ B.readFile f + +writeFnBinary :: MonadIO m => FilePath -> B.ByteString -> m () +writeFnBinary "-" = liftIO . B.putStr +writeFnBinary f = liftIO . B.writeFile (UTF8.encodePath f) + +writerFn :: MonadIO m => FilePath -> String -> m () +writerFn "-" = liftIO . UTF8.putStr +writerFn f = liftIO . UTF8.writeFile f + +lookupHighlightStyle :: Maybe String -> IO (Maybe Style) +lookupHighlightStyle Nothing = return Nothing +lookupHighlightStyle (Just s) = + case lookup (map toLower s) highlightingStyles of + Just sty -> return (Just sty) + Nothing -> err 68 $ "Unknown highlight-style " ++ s + +-- | A list of functions, each transforming the options data structure +-- in response to a command-line option. +options :: [OptDescr (Opt -> IO Opt)] +options = + [ Option "fr" ["from","read"] + (ReqArg + (\arg opt -> return opt { optReader = Just arg }) + "FORMAT") + "" + + , Option "tw" ["to","write"] + (ReqArg + (\arg opt -> return opt { optWriter = Just arg }) + "FORMAT") + "" + + , Option "o" ["output"] + (ReqArg + (\arg opt -> return opt { optOutputFile = arg }) + "FILENAME") + "" -- "Name of output file" + + , Option "" ["data-dir"] + (ReqArg + (\arg opt -> return opt { optDataDir = Just arg }) + "DIRECTORY") -- "Directory containing pandoc data files." + "" + + , Option "" ["base-header-level"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t > 0 && t < 6 -> do + return opt{ optBaseHeaderLevel = t } + _ -> err 19 + "base-header-level must be 1-5") + "NUMBER") + "" -- "Headers base level" + + , Option "" ["indented-code-classes"] + (ReqArg + (\arg opt -> return opt { optIndentedCodeClasses = words $ + map (\c -> if c == ',' then ' ' else c) arg }) + "STRING") + "" -- "Classes (whitespace- or comma-separated) to use for indented code-blocks" + + , Option "F" ["filter"] + (ReqArg + (\arg opt -> return opt { optFilters = arg : optFilters opt }) + "PROGRAM") + "" -- "External JSON filter" + + , Option "p" ["preserve-tabs"] + (NoArg + (\opt -> return opt { optPreserveTabs = True })) + "" -- "Preserve tabs instead of converting to spaces" + + , Option "" ["tab-stop"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t > 0 -> return opt { optTabStop = t } + _ -> err 31 + "tab-stop must be a number greater than 0") + "NUMBER") + "" -- "Tab stop (default 4)" + + , Option "" ["track-changes"] + (ReqArg + (\arg opt -> do + action <- case arg of + "accept" -> return AcceptChanges + "reject" -> return RejectChanges + "all" -> return AllChanges + _ -> err 6 + ("Unknown option for track-changes: " ++ arg) + return opt { optTrackChanges = action }) + "accept|reject|all") + "" -- "Accepting or reject MS Word track-changes."" + + , Option "" ["file-scope"] + (NoArg + (\opt -> return opt { optFileScope = True })) + "" -- "Parse input files before combining" + + , Option "" ["extract-media"] + (ReqArg + (\arg opt -> + return opt { optExtractMedia = Just arg }) + "PATH") + "" -- "Directory to which to extract embedded media" + + , Option "s" ["standalone"] + (NoArg + (\opt -> return opt { optStandalone = True })) + "" -- "Include needed header and footer on output" + + , Option "" ["template"] + (ReqArg + (\arg opt -> + return opt{ optTemplate = Just arg, + optStandalone = True }) + "FILENAME") + "" -- "Use custom template" + + , Option "M" ["metadata"] + (ReqArg + (\arg opt -> do + let (key, val) = splitField arg + return opt{ optMetadata = (key, val) : optMetadata opt }) + "KEY[:VALUE]") + "" + + , Option "V" ["variable"] + (ReqArg + (\arg opt -> do + let (key, val) = splitField arg + return opt{ optVariables = (key, val) : optVariables opt }) + "KEY[:VALUE]") + "" + + , Option "D" ["print-default-template"] + (ReqArg + (\arg _ -> do + templ <- getDefaultTemplate Nothing arg + case templ of + Right t -> UTF8.hPutStr stdout t + Left e -> error $ show e + exitSuccess) + "FORMAT") + "" -- "Print default template for FORMAT" + + , Option "" ["print-default-data-file"] + (ReqArg + (\arg _ -> do + readDataFile Nothing arg >>= BS.hPutStr stdout + exitSuccess) + "FILE") + "" -- "Print default data file" + + , Option "" ["dpi"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t > 0 -> return opt { optDpi = t } + _ -> err 31 + "dpi must be a number greater than 0") + "NUMBER") + "" -- "Dpi (default 96)" + + , Option "" ["wrap"] + (ReqArg + (\arg opt -> + case safeRead ("Wrap" ++ uppercaseFirstLetter arg) of + Just o -> return opt { optWrapText = o } + Nothing -> err 77 "--wrap must be auto, none, or preserve") + "auto|none|preserve") + "" -- "Option for wrapping text in output" + + , Option "" ["columns"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t > 0 -> return opt { optColumns = t } + _ -> err 33 + "columns must be a number greater than 0") + "NUMBER") + "" -- "Length of line in characters" + + , Option "" ["toc", "table-of-contents"] + (NoArg + (\opt -> return opt { optTableOfContents = True })) + "" -- "Include table of contents" + + , Option "" ["toc-depth"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t >= 1 && t <= 6 -> + return opt { optTOCDepth = t } + _ -> err 57 + "TOC level must be a number between 1 and 6") + "NUMBER") + "" -- "Number of levels to include in TOC" + + , Option "" ["no-highlight"] + (NoArg + (\opt -> return opt { optHighlightStyle = Nothing })) + "" -- "Don't highlight source code" + + , Option "" ["highlight-style"] + (ReqArg + (\arg opt -> return opt{ optHighlightStyle = Just arg }) + "STYLE") + "" -- "Style for highlighted code" + + , Option "H" ["include-in-header"] + (ReqArg + (\arg opt -> return opt{ optIncludeInHeader = + arg : optIncludeInHeader opt, + optStandalone = True }) + "FILENAME") + "" -- "File to include at end of header (implies -s)" + + , Option "B" ["include-before-body"] + (ReqArg + (\arg opt -> return opt{ optIncludeBeforeBody = + arg : optIncludeBeforeBody opt, + optStandalone = True }) + "FILENAME") + "" -- "File to include before document body" + + , Option "A" ["include-after-body"] + (ReqArg + (\arg opt -> return opt{ optIncludeAfterBody = + arg : optIncludeAfterBody opt, + optStandalone = True }) + "FILENAME") + "" -- "File to include after document body" + + , Option "" ["self-contained"] + (NoArg + (\opt -> return opt { optSelfContained = True, + optStandalone = True })) + "" -- "Make slide shows include all the needed js and css" + + , Option "" ["html-q-tags"] + (NoArg + (\opt -> + return opt { optHtmlQTags = True })) + "" -- "Use <q> tags for quotes in HTML" + + , Option "" ["ascii"] + (NoArg + (\opt -> return opt { optAscii = True })) + "" -- "Use ascii characters only in HTML output" + + , Option "" ["reference-links"] + (NoArg + (\opt -> return opt { optReferenceLinks = True } )) + "" -- "Use reference links in parsing HTML" + + , Option "" ["reference-location"] + (ReqArg + (\arg opt -> do + action <- case arg of + "block" -> return EndOfBlock + "section" -> return EndOfSection + "document" -> return EndOfDocument + _ -> err 6 + ("Unknown option for reference-location: " ++ arg) + return opt { optReferenceLocation = action }) + "block|section|document") + "" -- "Accepting or reject MS Word track-changes."" + + , Option "" ["atx-headers"] + (NoArg + (\opt -> return opt { optSetextHeaders = False } )) + "" -- "Use atx-style headers for markdown" + + , Option "" ["top-level-division"] + (ReqArg + (\arg opt -> do + let tldName = "TopLevel" ++ uppercaseFirstLetter arg + case safeRead tldName of + Just tlDiv -> return opt { optTopLevelDivision = tlDiv } + _ -> err 76 ("Top-level division must be " ++ + "section, chapter, part, or default")) + "section|chapter|part") + "" -- "Use top-level division type in LaTeX, ConTeXt, DocBook" + + , Option "N" ["number-sections"] + (NoArg + (\opt -> return opt { optNumberSections = True })) + "" -- "Number sections in LaTeX" + + , Option "" ["number-offset"] + (ReqArg + (\arg opt -> + case safeRead ('[':arg ++ "]") of + Just ns -> return opt { optNumberOffset = ns, + optNumberSections = True } + _ -> err 57 "could not parse number-offset") + "NUMBERS") + "" -- "Starting number for sections, subsections, etc." + + , Option "" ["listings"] + (NoArg + (\opt -> return opt { optListings = True })) + "" -- "Use listings package for LaTeX code blocks" + + , Option "i" ["incremental"] + (NoArg + (\opt -> return opt { optIncremental = True })) + "" -- "Make list items display incrementally in Slidy/Slideous/S5" + + , Option "" ["slide-level"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t >= 1 && t <= 6 -> + return opt { optSlideLevel = Just t } + _ -> err 39 + "slide level must be a number between 1 and 6") + "NUMBER") + "" -- "Force header level for slides" + + , Option "" ["section-divs"] + (NoArg + (\opt -> return opt { optSectionDivs = True })) + "" -- "Put sections in div tags in HTML" + + , Option "" ["default-image-extension"] + (ReqArg + (\arg opt -> return opt { optDefaultImageExtension = arg }) + "extension") + "" -- "Default extension for extensionless images" + + , Option "" ["email-obfuscation"] + (ReqArg + (\arg opt -> do + method <- case arg of + "references" -> return ReferenceObfuscation + "javascript" -> return JavascriptObfuscation + "none" -> return NoObfuscation + _ -> err 6 + ("Unknown obfuscation method: " ++ arg) + return opt { optEmailObfuscation = method }) + "none|javascript|references") + "" -- "Method for obfuscating email in HTML" + + , Option "" ["id-prefix"] + (ReqArg + (\arg opt -> return opt { optIdentifierPrefix = arg }) + "STRING") + "" -- "Prefix to add to automatically generated HTML identifiers" + + , Option "T" ["title-prefix"] + (ReqArg + (\arg opt -> do + let newvars = ("title-prefix", arg) : optVariables opt + return opt { optVariables = newvars, + optStandalone = True }) + "STRING") + "" -- "String to prefix to HTML window title" + + , Option "c" ["css"] + (ReqArg + (\arg opt -> return opt{ optCss = arg : optCss opt }) + -- add new link to end, so it is included in proper order + "URL") + "" -- "Link to CSS style sheet" + + , Option "" ["reference-doc"] + (ReqArg + (\arg opt -> + return opt { optReferenceDoc = Just arg }) + "FILENAME") + "" -- "Path of custom reference doc" + + , Option "" ["epub-stylesheet"] + (ReqArg + (\arg opt -> return opt { optEpubStylesheet = Just arg }) + "FILENAME") + "" -- "Path of epub.css" + + , Option "" ["epub-cover-image"] + (ReqArg + (\arg opt -> + return opt { optVariables = + ("epub-cover-image", arg) : optVariables opt }) + "FILENAME") + "" -- "Path of epub cover image" + + , Option "" ["epub-metadata"] + (ReqArg + (\arg opt -> return opt { optEpubMetadata = Just arg }) + "FILENAME") + "" -- "Path of epub metadata file" + + , Option "" ["epub-embed-font"] + (ReqArg + (\arg opt -> + return opt{ optEpubFonts = arg : optEpubFonts opt }) + "FILE") + "" -- "Directory of fonts to embed" + + , Option "" ["epub-chapter-level"] + (ReqArg + (\arg opt -> + case safeRead arg of + Just t | t >= 1 && t <= 6 -> + return opt { optEpubChapterLevel = t } + _ -> err 59 + "chapter level must be a number between 1 and 6") + "NUMBER") + "" -- "Header level at which to split chapters in EPUB" + + , Option "" ["latex-engine"] + (ReqArg + (\arg opt -> do + let b = takeBaseName arg + if b `elem` ["pdflatex", "lualatex", "xelatex"] + then return opt { optLaTeXEngine = arg } + else err 45 "latex-engine must be pdflatex, lualatex, or xelatex.") + "PROGRAM") + "" -- "Name of latex program to use in generating PDF" + + , Option "" ["latex-engine-opt"] + (ReqArg + (\arg opt -> do + let oldArgs = optLaTeXEngineArgs opt + return opt { optLaTeXEngineArgs = arg : oldArgs }) + "STRING") + "" -- "Flags to pass to the LaTeX engine, all instances of this option are accumulated and used" + + , Option "" ["bibliography"] + (ReqArg + (\arg opt -> return opt{ optMetadata = + ("bibliography", arg) : optMetadata opt }) + "FILE") + "" + + , Option "" ["csl"] + (ReqArg + (\arg opt -> + return opt{ optMetadata = + ("csl", arg) : optMetadata opt }) + "FILE") + "" + + , Option "" ["citation-abbreviations"] + (ReqArg + (\arg opt -> + return opt{ optMetadata = + ("citation-abbreviations", arg): optMetadata opt }) + "FILE") + "" + + , Option "" ["natbib"] + (NoArg + (\opt -> return opt { optCiteMethod = Natbib })) + "" -- "Use natbib cite commands in LaTeX output" + + , Option "" ["biblatex"] + (NoArg + (\opt -> return opt { optCiteMethod = Biblatex })) + "" -- "Use biblatex cite commands in LaTeX output" + + , Option "m" ["latexmathml", "asciimathml"] + (OptArg + (\arg opt -> + return opt { optHTMLMathMethod = LaTeXMathML arg }) + "URL") + "" -- "Use LaTeXMathML script in html output" + + , Option "" ["mathml"] + (NoArg + (\opt -> + return opt { optHTMLMathMethod = MathML })) + "" -- "Use mathml for HTML math" + + , Option "" ["mimetex"] + (OptArg + (\arg opt -> do + let url' = case arg of + Just u -> u ++ "?" + Nothing -> "/cgi-bin/mimetex.cgi?" + return opt { optHTMLMathMethod = WebTeX url' }) + "URL") + "" -- "Use mimetex for HTML math" + + , Option "" ["webtex"] + (OptArg + (\arg opt -> do + let url' = fromMaybe "https://latex.codecogs.com/png.latex?" arg + return opt { optHTMLMathMethod = WebTeX url' }) + "URL") + "" -- "Use web service for HTML math" + + , Option "" ["jsmath"] + (OptArg + (\arg opt -> return opt { optHTMLMathMethod = JsMath arg}) + "URL") + "" -- "Use jsMath for HTML math" + + , Option "" ["mathjax"] + (OptArg + (\arg opt -> do + let url' = fromMaybe "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_CHTML-full" arg + return opt { optHTMLMathMethod = MathJax url'}) + "URL") + "" -- "Use MathJax for HTML math" + , Option "" ["katex"] + (OptArg + (\arg opt -> + return opt + { optKaTeXJS = + arg <|> Just "https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.6.0/katex.min.js"}) + "URL") + "" -- Use KaTeX for HTML Math + + , Option "" ["katex-stylesheet"] + (ReqArg + (\arg opt -> + return opt { optKaTeXStylesheet = Just arg }) + "URL") + "" -- Set the KaTeX Stylesheet location + + , Option "" ["gladtex"] + (NoArg + (\opt -> return opt { optHTMLMathMethod = GladTeX })) + "" -- "Use gladtex for HTML math" + + , Option "" ["trace"] + (NoArg + (\opt -> return opt { optVerbosity = DEBUG })) + "" -- "Turn on diagnostic tracing in readers." + + , Option "" ["dump-args"] + (NoArg + (\opt -> return opt { optDumpArgs = True })) + "" -- "Print output filename and arguments to stdout." + + , Option "" ["ignore-args"] + (NoArg + (\opt -> return opt { optIgnoreArgs = True })) + "" -- "Ignore command-line arguments." + + , Option "" ["verbose"] + (NoArg + (\opt -> return opt { optVerbosity = INFO })) + "" -- "Verbose diagnostic output." + + , Option "" ["quiet"] + (NoArg + (\opt -> return opt { optVerbosity = ERROR })) + "" -- "Suppress warnings." + + , Option "" ["fail-if-warnings"] + (NoArg + (\opt -> return opt { optFailIfWarnings = True })) + "" -- "Exit with error status if there were warnings." + + , Option "" ["log"] + (ReqArg + (\arg opt -> return opt{ optLogFile = Just arg }) + "FILE") + "" -- "Log messages in JSON format to this file." + + , Option "" ["bash-completion"] + (NoArg + (\_ -> do + ddir <- getDataDir + tpl <- readDataFileUTF8 Nothing "bash_completion.tpl" + let optnames (Option shorts longs _ _) = + map (\c -> ['-',c]) shorts ++ + map ("--" ++) longs + let allopts = unwords (concatMap optnames options) + UTF8.hPutStrLn stdout $ printf tpl allopts + (unwords readers'names) + (unwords writers'names) + (unwords $ map fst highlightingStyles) + ddir + exitSuccess )) + "" -- "Print bash completion script" + + , Option "" ["list-input-formats"] + (NoArg + (\_ -> do + mapM_ (UTF8.hPutStrLn stdout) readers'names + exitSuccess )) + "" + + , Option "" ["list-output-formats"] + (NoArg + (\_ -> do + mapM_ (UTF8.hPutStrLn stdout) writers'names + exitSuccess )) + "" + + , Option "" ["list-extensions"] + (NoArg + (\_ -> do + let showExt x = drop 4 (show x) ++ + if extensionEnabled x pandocExtensions + then " +" + else " -" + mapM_ (UTF8.hPutStrLn stdout . showExt) + ([minBound..maxBound] :: [Extension]) + exitSuccess )) + "" + + , Option "" ["list-highlight-languages"] + (NoArg + (\_ -> do + let langs = [ T.unpack (T.toLower (sShortname s)) + | s <- M.elems defaultSyntaxMap + , sShortname s `notElem` + [T.pack "Alert", T.pack "Alert_indent"] + ] + mapM_ (UTF8.hPutStrLn stdout) langs + exitSuccess )) + "" + + , Option "" ["list-highlight-styles"] + (NoArg + (\_ -> do + mapM_ (UTF8.hPutStrLn stdout) $ + map fst highlightingStyles + exitSuccess )) + "" + + , Option "v" ["version"] + (NoArg + (\_ -> do + prg <- getProgName + defaultDatadir <- E.catch + (getAppUserDataDirectory "pandoc") + (\e -> let _ = (e :: E.SomeException) + in return "") + UTF8.hPutStrLn stdout (prg ++ " " ++ pandocVersion ++ + compileInfo ++ "\nDefault user data directory: " ++ + defaultDatadir ++ copyrightMessage) + exitSuccess )) + "" -- "Print version" + + , Option "h" ["help"] + (NoArg + (\_ -> do + prg <- getProgName + UTF8.hPutStr stdout (usageMessage prg options) + exitSuccess )) + "" -- "Show help" + + ] + +-- Returns usage message +usageMessage :: String -> [OptDescr (Opt -> IO Opt)] -> String +usageMessage programName = usageInfo (programName ++ " [OPTIONS] [FILES]") + +copyrightMessage :: String +copyrightMessage = intercalate "\n" [ + "", + "Copyright (C) 2006-2017 John MacFarlane", + "Web: http://pandoc.org", + "This is free software; see the source for copying conditions.", + "There is no warranty, not even for merchantability or fitness", + "for a particular purpose." ] + +compileInfo :: String +compileInfo = + "\nCompiled with pandoc-types " ++ VERSION_pandoc_types ++ ", texmath " ++ + VERSION_texmath ++ ", skylighting " ++ VERSION_skylighting + +handleUnrecognizedOption :: String -> [String] -> [String] +handleUnrecognizedOption "--smart" = + (("--smart/-S has been removed. Use +smart or -smart extension instead.\n" ++ + "For example: pandoc -f markdown+smart -t markdown-smart.") :) +handleUnrecognizedOption "-S" = handleUnrecognizedOption "--smart" +handleUnrecognizedOption "--old-dashes" = + ("--old-dashes has been removed. Use +old_dashes extension instead." :) +handleUnrecognizedOption "--no-wrap" = + ("--no-wrap has been removed. Use --wrap=none instead." :) +handleUnrecognizedOption "--chapters" = + ("--chapters has been removed. Use --top-level-division=chapter instead." :) +handleUnrecognizedOption "--reference-docx" = + ("--reference-docx has been removed. Use --reference-doc instead." :) +handleUnrecognizedOption "--reference-odt" = + ("--reference-odt has been removed. Use --reference-doc instead." :) +handleUnrecognizedOption "--parse-raw" = + (("--parse-raw/-R has been removed. Use +raw_html or +raw_tex extension.\n") :) +handleUnrecognizedOption "-R" = handleUnrecognizedOption "--parse-raw" +handleUnrecognizedOption x = + (("Unknown option " ++ x ++ ".") :) + +uppercaseFirstLetter :: String -> String +uppercaseFirstLetter (c:cs) = toUpper c : cs +uppercaseFirstLetter [] = [] + +readers'names :: [String] +readers'names = sort (map fst (readers :: [(String, Reader PandocIO)])) + +writers'names :: [String] +writers'names = sort (map fst (writers :: [(String, Writer PandocIO)])) + +splitField :: String -> (String, String) +splitField s = + case break (`elem` ":=") s of + (k,_:v) -> (k,v) + (k,[]) -> (k,"true") + diff --git a/src/Text/Pandoc/Asciify.hs b/src/Text/Pandoc/Asciify.hs new file mode 100644 index 000000000..8eb1ba663 --- /dev/null +++ b/src/Text/Pandoc/Asciify.hs @@ -0,0 +1,422 @@ +{- +Copyright (C) 2013-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Asciify + Copyright : Copyright (C) 2013-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Function to convert accented latin letters to their unaccented +ascii equivalents (used in constructing HTML identifiers). +-} +module Text.Pandoc.Asciify (toAsciiChar) +where +import qualified Data.Map as M +import Data.Char (isAscii) + +toAsciiChar :: Char -> Maybe Char +toAsciiChar c | isAscii c = Just c + | otherwise = M.lookup c asciiMap + +asciiMap :: M.Map Char Char +asciiMap = M.fromList + [('\192','A') + ,('\193','A') + ,('\194','A') + ,('\195','A') + ,('\196','A') + ,('\197','A') + ,('\199','C') + ,('\200','E') + ,('\201','E') + ,('\202','E') + ,('\203','E') + ,('\204','I') + ,('\205','I') + ,('\206','I') + ,('\207','I') + ,('\209','N') + ,('\210','O') + ,('\211','O') + ,('\212','O') + ,('\213','O') + ,('\214','O') + ,('\217','U') + ,('\218','U') + ,('\219','U') + ,('\220','U') + ,('\221','Y') + ,('\224','a') + ,('\225','a') + ,('\226','a') + ,('\227','a') + ,('\228','a') + ,('\229','a') + ,('\231','c') + ,('\232','e') + ,('\233','e') + ,('\234','e') + ,('\235','e') + ,('\236','i') + ,('\237','i') + ,('\238','i') + ,('\239','i') + ,('\241','n') + ,('\242','o') + ,('\243','o') + ,('\244','o') + ,('\245','o') + ,('\246','o') + ,('\249','u') + ,('\250','u') + ,('\251','u') + ,('\252','u') + ,('\253','y') + ,('\255','y') + ,('\256','A') + ,('\257','a') + ,('\258','A') + ,('\259','a') + ,('\260','A') + ,('\261','a') + ,('\262','C') + ,('\263','c') + ,('\264','C') + ,('\265','c') + ,('\266','C') + ,('\267','c') + ,('\268','C') + ,('\269','c') + ,('\270','D') + ,('\271','d') + ,('\274','E') + ,('\275','e') + ,('\276','E') + ,('\277','e') + ,('\278','E') + ,('\279','e') + ,('\280','E') + ,('\281','e') + ,('\282','E') + ,('\283','e') + ,('\284','G') + ,('\285','g') + ,('\286','G') + ,('\287','g') + ,('\288','G') + ,('\289','g') + ,('\290','G') + ,('\291','g') + ,('\292','H') + ,('\293','h') + ,('\296','I') + ,('\297','i') + ,('\298','I') + ,('\299','i') + ,('\300','I') + ,('\301','i') + ,('\302','I') + ,('\303','i') + ,('\304','I') + ,('\308','J') + ,('\309','j') + ,('\310','K') + ,('\311','k') + ,('\313','L') + ,('\314','l') + ,('\315','L') + ,('\316','l') + ,('\317','L') + ,('\318','l') + ,('\323','N') + ,('\324','n') + ,('\325','N') + ,('\326','n') + ,('\327','N') + ,('\328','n') + ,('\332','O') + ,('\333','o') + ,('\334','O') + ,('\335','o') + ,('\336','O') + ,('\337','o') + ,('\340','R') + ,('\341','r') + ,('\342','R') + ,('\343','r') + ,('\344','R') + ,('\345','r') + ,('\346','S') + ,('\347','s') + ,('\348','S') + ,('\349','s') + ,('\350','S') + ,('\351','s') + ,('\352','S') + ,('\353','s') + ,('\354','T') + ,('\355','t') + ,('\356','T') + ,('\357','t') + ,('\360','U') + ,('\361','u') + ,('\362','U') + ,('\363','u') + ,('\364','U') + ,('\365','u') + ,('\366','U') + ,('\367','u') + ,('\368','U') + ,('\369','u') + ,('\370','U') + ,('\371','u') + ,('\372','W') + ,('\373','w') + ,('\374','Y') + ,('\375','y') + ,('\376','Y') + ,('\377','Z') + ,('\378','z') + ,('\379','Z') + ,('\380','z') + ,('\381','Z') + ,('\382','z') + ,('\416','O') + ,('\417','o') + ,('\431','U') + ,('\432','u') + ,('\461','A') + ,('\462','a') + ,('\463','I') + ,('\464','i') + ,('\465','O') + ,('\466','o') + ,('\467','U') + ,('\468','u') + ,('\486','G') + ,('\487','g') + ,('\488','K') + ,('\489','k') + ,('\490','O') + ,('\491','o') + ,('\496','j') + ,('\500','G') + ,('\501','g') + ,('\504','N') + ,('\505','n') + ,('\512','A') + ,('\513','a') + ,('\514','A') + ,('\515','a') + ,('\516','E') + ,('\517','e') + ,('\518','E') + ,('\519','e') + ,('\520','I') + ,('\521','i') + ,('\522','I') + ,('\523','i') + ,('\524','O') + ,('\525','o') + ,('\526','O') + ,('\527','o') + ,('\528','R') + ,('\529','r') + ,('\530','R') + ,('\531','r') + ,('\532','U') + ,('\533','u') + ,('\534','U') + ,('\535','u') + ,('\536','S') + ,('\537','s') + ,('\538','T') + ,('\539','t') + ,('\542','H') + ,('\543','h') + ,('\550','A') + ,('\551','a') + ,('\552','E') + ,('\553','e') + ,('\558','O') + ,('\559','o') + ,('\562','Y') + ,('\563','y') + ,('\894',';') + ,('\7680','A') + ,('\7681','a') + ,('\7682','B') + ,('\7683','b') + ,('\7684','B') + ,('\7685','b') + ,('\7686','B') + ,('\7687','b') + ,('\7690','D') + ,('\7691','d') + ,('\7692','D') + ,('\7693','d') + ,('\7694','D') + ,('\7695','d') + ,('\7696','D') + ,('\7697','d') + ,('\7698','D') + ,('\7699','d') + ,('\7704','E') + ,('\7705','e') + ,('\7706','E') + ,('\7707','e') + ,('\7710','F') + ,('\7711','f') + ,('\7712','G') + ,('\7713','g') + ,('\7714','H') + ,('\7715','h') + ,('\7716','H') + ,('\7717','h') + ,('\7718','H') + ,('\7719','h') + ,('\7720','H') + ,('\7721','h') + ,('\7722','H') + ,('\7723','h') + ,('\7724','I') + ,('\7725','i') + ,('\7728','K') + ,('\7729','k') + ,('\7730','K') + ,('\7731','k') + ,('\7732','K') + ,('\7733','k') + ,('\7734','L') + ,('\7735','l') + ,('\7738','L') + ,('\7739','l') + ,('\7740','L') + ,('\7741','l') + ,('\7742','M') + ,('\7743','m') + ,('\7744','M') + ,('\7745','m') + ,('\7746','M') + ,('\7747','m') + ,('\7748','N') + ,('\7749','n') + ,('\7750','N') + ,('\7751','n') + ,('\7752','N') + ,('\7753','n') + ,('\7754','N') + ,('\7755','n') + ,('\7764','P') + ,('\7765','p') + ,('\7766','P') + ,('\7767','p') + ,('\7768','R') + ,('\7769','r') + ,('\7770','R') + ,('\7771','r') + ,('\7774','R') + ,('\7775','r') + ,('\7776','S') + ,('\7777','s') + ,('\7778','S') + ,('\7779','s') + ,('\7786','T') + ,('\7787','t') + ,('\7788','T') + ,('\7789','t') + ,('\7790','T') + ,('\7791','t') + ,('\7792','T') + ,('\7793','t') + ,('\7794','U') + ,('\7795','u') + ,('\7796','U') + ,('\7797','u') + ,('\7798','U') + ,('\7799','u') + ,('\7804','V') + ,('\7805','v') + ,('\7806','V') + ,('\7807','v') + ,('\7808','W') + ,('\7809','w') + ,('\7810','W') + ,('\7811','w') + ,('\7812','W') + ,('\7813','w') + ,('\7814','W') + ,('\7815','w') + ,('\7816','W') + ,('\7817','w') + ,('\7818','X') + ,('\7819','x') + ,('\7820','X') + ,('\7821','x') + ,('\7822','Y') + ,('\7823','y') + ,('\7824','Z') + ,('\7825','z') + ,('\7826','Z') + ,('\7827','z') + ,('\7828','Z') + ,('\7829','z') + ,('\7830','h') + ,('\7831','t') + ,('\7832','w') + ,('\7833','y') + ,('\7840','A') + ,('\7841','a') + ,('\7842','A') + ,('\7843','a') + ,('\7864','E') + ,('\7865','e') + ,('\7866','E') + ,('\7867','e') + ,('\7868','E') + ,('\7869','e') + ,('\7880','I') + ,('\7881','i') + ,('\7882','I') + ,('\7883','i') + ,('\7884','O') + ,('\7885','o') + ,('\7886','O') + ,('\7887','o') + ,('\7908','U') + ,('\7909','u') + ,('\7910','U') + ,('\7911','u') + ,('\7922','Y') + ,('\7923','y') + ,('\7924','Y') + ,('\7925','y') + ,('\7926','Y') + ,('\7927','y') + ,('\7928','Y') + ,('\7929','y') + ,('\8175','`') + ,('\8490','K') + ,('\8800','=') + ,('\8814','<') + ,('\8815','>') + ] diff --git a/src/Text/Pandoc/CSS.hs b/src/Text/Pandoc/CSS.hs new file mode 100644 index 000000000..f479ed9d0 --- /dev/null +++ b/src/Text/Pandoc/CSS.hs @@ -0,0 +1,43 @@ +module Text.Pandoc.CSS ( foldOrElse + , pickStyleAttrProps + , pickStylesToKVs + ) +where + +import Text.Pandoc.Shared (trim) +import Text.Parsec +import Text.Parsec.String + +ruleParser :: Parser (String, String) +ruleParser = do + p <- many1 (noneOf ":") <* char ':' + v <- many1 (noneOf ":;") <* (optional $ char ';') <* spaces + return (trim p, trim v) + +styleAttrParser :: Parser [(String, String)] +styleAttrParser = many1 ruleParser + +orElse :: Eq a => a -> a -> a -> a +orElse v x y = if v == x then y else x + +foldOrElse :: Eq a => a -> [a] -> a +foldOrElse v xs = foldr (orElse v) v xs + +eitherToMaybe :: Either a b -> Maybe b +eitherToMaybe (Right x) = Just x +eitherToMaybe _ = Nothing + +-- | takes a list of keys/properties and a CSS string and +-- returns the corresponding key-value-pairs. +pickStylesToKVs :: [String] -> String -> [(String, String)] +pickStylesToKVs props styleAttr = + case parse styleAttrParser "" styleAttr of + Left _ -> [] + Right styles -> filter (\s -> fst s `elem` props) styles + +-- | takes a list of key/property synonyms and a CSS string and maybe +-- returns the value of the first match (in order of the supplied list) +pickStyleAttrProps :: [String] -> String -> Maybe String +pickStyleAttrProps lookupProps styleAttr = do + styles <- eitherToMaybe $ parse styleAttrParser "" styleAttr + foldOrElse Nothing $ map (flip lookup styles) lookupProps diff --git a/src/Text/Pandoc/Class.hs b/src/Text/Pandoc/Class.hs new file mode 100644 index 000000000..fb148666c --- /dev/null +++ b/src/Text/Pandoc/Class.hs @@ -0,0 +1,539 @@ +{-# LANGUAGE DeriveFunctor, DeriveDataTypeable, TypeSynonymInstances, +FlexibleInstances, GeneralizedNewtypeDeriving, FlexibleContexts #-} + +{- +Copyright (C) 2016 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Class + Copyright : Copyright (C) 2016 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Typeclass for pandoc readers and writers, allowing both IO and pure instances. +-} + +module Text.Pandoc.Class ( PandocMonad(..) + , CommonState(..) + , PureState(..) + , getPureState + , getsPureState + , putPureState + , modifyPureState + , getPOSIXTime + , getZonedTime + , readFileFromDirs + , report + , getLog + , setVerbosity + , getMediaBag + , setMediaBag + , insertMedia + , fetchItem + , getInputFiles + , getOutputFile + , PandocIO(..) + , PandocPure(..) + , FileTree(..) + , FileInfo(..) + , runIO + , runIOorExplode + , runPure + , withMediaBag + ) where + +import Prelude hiding (readFile) +import System.Random (StdGen, next, mkStdGen) +import qualified System.Random as IO (newStdGen) +import Codec.Archive.Zip (Archive, fromArchive, emptyArchive) +import Data.Unique (hashUnique) +import qualified Data.Unique as IO (newUnique) +import qualified Text.Pandoc.Shared as IO ( readDataFile + , openURL ) +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Compat.Time (UTCTime) +import Text.Pandoc.Logging +import Text.Parsec (ParsecT) +import qualified Text.Pandoc.Compat.Time as IO (getCurrentTime) +import Text.Pandoc.MIME (MimeType, getMimeType) +import Data.Time.Clock.POSIX ( utcTimeToPOSIXSeconds + , posixSecondsToUTCTime + , POSIXTime ) +import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime, utc) +import Network.URI ( escapeURIString, nonStrictRelativeTo, + unEscapeString, parseURIReference, isAllowedInURI, + parseURI, URI(..) ) +import qualified Data.Time.LocalTime as IO (getCurrentTimeZone) +import Text.Pandoc.MediaBag (MediaBag, lookupMedia) +import qualified Text.Pandoc.MediaBag as MB +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as BL +import qualified System.Environment as IO (lookupEnv) +import System.FilePath.Glob (match, compile) +import System.FilePath ((</>), takeExtension, dropExtension) +import qualified System.FilePath.Glob as IO (glob) +import qualified System.Directory as IO (getModificationTime) +import Control.Monad as M (fail) +import Control.Monad.Reader (ReaderT) +import Control.Monad.State +import Control.Monad.Except +import Control.Monad.Writer (WriterT) +import Control.Monad.RWS (RWST) +import Data.Word (Word8) +import Data.Default +import System.IO.Error +import System.IO (stderr) +import qualified Data.Map as M +import Text.Pandoc.Error +import Text.Printf (printf) + +class (Functor m, Applicative m, Monad m, MonadError PandocError m) + => PandocMonad m where + lookupEnv :: String -> m (Maybe String) + getCurrentTime :: m UTCTime + getCurrentTimeZone :: m TimeZone + newStdGen :: m StdGen + newUniqueHash :: m Int + openURL :: String -> m (B.ByteString, Maybe MimeType) + readFileLazy :: FilePath -> m BL.ByteString + readFileStrict :: FilePath -> m B.ByteString + readDataFile :: Maybe FilePath + -> FilePath + -> m B.ByteString + glob :: String -> m [FilePath] + getModificationTime :: FilePath -> m UTCTime + getCommonState :: m CommonState + putCommonState :: CommonState -> m () + + getsCommonState :: (CommonState -> a) -> m a + getsCommonState f = f <$> getCommonState + + modifyCommonState :: (CommonState -> CommonState) -> m () + modifyCommonState f = getCommonState >>= putCommonState . f + + logOutput :: LogMessage -> m () + +-- Functions defined for all PandocMonad instances + +setVerbosity :: PandocMonad m => Verbosity -> m () +setVerbosity verbosity = + modifyCommonState $ \st -> st{ stVerbosity = verbosity } + +getLog :: PandocMonad m => m [LogMessage] +getLog = reverse <$> getsCommonState stLog + +report :: PandocMonad m => LogMessage -> m () +report msg = do + verbosity <- getsCommonState stVerbosity + let level = messageVerbosity msg + when (level <= verbosity) $ do + logOutput msg + unless (level == DEBUG) $ + modifyCommonState $ \st -> st{ stLog = msg : stLog st } + +setMediaBag :: PandocMonad m => MediaBag -> m () +setMediaBag mb = modifyCommonState $ \st -> st{stMediaBag = mb} + +getMediaBag :: PandocMonad m => m MediaBag +getMediaBag = getsCommonState stMediaBag + +insertMedia :: PandocMonad m => FilePath -> Maybe MimeType -> BL.ByteString -> m () +insertMedia fp mime bs = do + mb <- getsCommonState stMediaBag + let mb' = MB.insertMedia fp mime bs mb + modifyCommonState $ \st -> st{stMediaBag = mb' } + +getInputFiles :: PandocMonad m => m (Maybe [FilePath]) +getInputFiles = getsCommonState stInputFiles + +getOutputFile :: PandocMonad m => m (Maybe FilePath) +getOutputFile = getsCommonState stOutputFile + +getPOSIXTime :: (PandocMonad m) => m POSIXTime +getPOSIXTime = utcTimeToPOSIXSeconds <$> getCurrentTime + +getZonedTime :: (PandocMonad m) => m ZonedTime +getZonedTime = do + t <- getCurrentTime + tz <- getCurrentTimeZone + return $ utcToZonedTime tz t + +-- | Read file, checking in any number of directories. +readFileFromDirs :: PandocMonad m => [FilePath] -> FilePath -> m (Maybe String) +readFileFromDirs [] _ = return Nothing +readFileFromDirs (d:ds) f = catchError + ((Just . UTF8.toStringLazy) <$> readFileLazy (d </> f)) + (\_ -> readFileFromDirs ds f) + +-- + +data CommonState = CommonState { stLog :: [LogMessage] + , stMediaBag :: MediaBag + , stInputFiles :: Maybe [FilePath] + , stOutputFile :: Maybe FilePath + , stVerbosity :: Verbosity + } + +instance Default CommonState where + def = CommonState { stLog = [] + , stMediaBag = mempty + , stInputFiles = Nothing + , stOutputFile = Nothing + , stVerbosity = WARNING + } + +runIO :: PandocIO a -> IO (Either PandocError a) +runIO ma = flip evalStateT def $ runExceptT $ unPandocIO ma + +withMediaBag :: PandocMonad m => m a -> m (a, MediaBag) +withMediaBag ma = ((,)) <$> ma <*> getMediaBag + +runIOorExplode :: PandocIO a -> IO a +runIOorExplode ma = runIO ma >>= handleError + +newtype PandocIO a = PandocIO { + unPandocIO :: ExceptT PandocError (StateT CommonState IO) a + } deriving ( MonadIO + , Functor + , Applicative + , Monad + , MonadError PandocError + ) + +instance PandocMonad PandocIO where + lookupEnv = liftIO . IO.lookupEnv + getCurrentTime = liftIO IO.getCurrentTime + getCurrentTimeZone = liftIO IO.getCurrentTimeZone + newStdGen = liftIO IO.newStdGen + newUniqueHash = hashUnique <$> (liftIO IO.newUnique) + openURL u = do + eitherRes <- liftIO $ (tryIOError $ IO.openURL u) + case eitherRes of + Right (Right res) -> return res + Right (Left _) -> throwError $ PandocFileReadError u + Left _ -> throwError $ PandocFileReadError u + readFileLazy s = do + eitherBS <- liftIO (tryIOError $ BL.readFile s) + case eitherBS of + Right bs -> return bs + Left _ -> throwError $ PandocFileReadError s + readFileStrict s = do + eitherBS <- liftIO (tryIOError $ B.readFile s) + case eitherBS of + Right bs -> return bs + Left _ -> throwError $ PandocFileReadError s + -- TODO: Make this more sensitive to the different sorts of failure + readDataFile mfp fname = do + eitherBS <- liftIO (tryIOError $ IO.readDataFile mfp fname) + case eitherBS of + Right bs -> return bs + Left _ -> throwError $ PandocFileReadError fname + glob = liftIO . IO.glob + getModificationTime fp = do + eitherMtime <- liftIO (tryIOError $ IO.getModificationTime fp) + case eitherMtime of + Right mtime -> return mtime + Left _ -> throwError $ PandocFileReadError fp + getCommonState = PandocIO $ lift get + putCommonState x = PandocIO $ lift $ put x + logOutput msg = + liftIO $ UTF8.hPutStrLn stderr $ printf "%-7s %s" + (show (messageVerbosity msg)) (showLogMessage msg) + +-- | Specialized version of parseURIReference that disallows +-- single-letter schemes. Reason: these are usually windows absolute +-- paths. +parseURIReference' :: String -> Maybe URI +parseURIReference' s = + case parseURIReference s of + Just u + | length (uriScheme u) > 2 -> Just u + | null (uriScheme u) -> Just u -- protocol-relative + _ -> Nothing + +-- | Fetch an image or other item from the local filesystem or the net. +-- Returns raw content and maybe mime type. +fetchItem :: PandocMonad m + => Maybe String + -> String + -> m (B.ByteString, Maybe MimeType) +fetchItem sourceURL s = do + mediabag <- getMediaBag + case lookupMedia s mediabag of + Just (mime, bs) -> return $ (BL.toStrict bs, Just mime) + Nothing -> downloadOrRead sourceURL s + +downloadOrRead :: PandocMonad m + => Maybe String + -> String + -> m (B.ByteString, Maybe MimeType) +downloadOrRead sourceURL s = do + case (sourceURL >>= parseURIReference' . + ensureEscaped, ensureEscaped s) of + (Just u, s') -> -- try fetching from relative path at source + case parseURIReference' s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u + Nothing -> openURL s' -- will throw error + (Nothing, s'@('/':'/':_)) -> -- protocol-relative URI + case parseURIReference' s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` httpcolon + Nothing -> openURL s' -- will throw error + (Nothing, s') -> + case parseURI s' of -- requires absolute URI + -- We don't want to treat C:/ as a scheme: + Just u' | length (uriScheme u') > 2 -> openURL (show u') + Just u' | uriScheme u' == "file:" -> + readLocalFile $ dropWhile (=='/') (uriPath u') + _ -> readLocalFile fp -- get from local file system + where readLocalFile f = do + cont <- readFileStrict f + return (cont, mime) + httpcolon = URI{ uriScheme = "http:", + uriAuthority = Nothing, + uriPath = "", + uriQuery = "", + uriFragment = "" } + dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') + fp = unEscapeString $ dropFragmentAndQuery s + mime = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + ".svgz" -> getMimeType $ dropExtension fp ++ ".svg" + x -> getMimeType x + ensureEscaped = escapeURIString isAllowedInURI . map convertSlash + convertSlash '\\' = '/' + convertSlash x = x + +data PureState = PureState { stStdGen :: StdGen + , stWord8Store :: [Word8] -- should be + -- inifinite, + -- i.e. [1..] + , stUniqStore :: [Int] -- should be + -- inifinite and + -- contain every + -- element at most + -- once, e.g. [1..] + , stEnv :: [(String, String)] + , stTime :: UTCTime + , stTimeZone :: TimeZone + , stReferenceDocx :: Archive + , stReferenceODT :: Archive + , stFiles :: FileTree + , stUserDataDir :: FileTree + , stCabalDataDir :: FileTree + , stFontFiles :: [FilePath] + } + +instance Default PureState where + def = PureState { stStdGen = mkStdGen 1848 + , stWord8Store = [1..] + , stUniqStore = [1..] + , stEnv = [("USER", "pandoc-user")] + , stTime = posixSecondsToUTCTime 0 + , stTimeZone = utc + , stReferenceDocx = emptyArchive + , stReferenceODT = emptyArchive + , stFiles = mempty + , stUserDataDir = mempty + , stCabalDataDir = mempty + , stFontFiles = [] + } + + +getPureState :: PandocPure PureState +getPureState = PandocPure $ lift $ lift $ get + +getsPureState :: (PureState -> a) -> PandocPure a +getsPureState f = f <$> getPureState + +putPureState :: PureState -> PandocPure () +putPureState ps= PandocPure $ lift $ lift $ put ps + +modifyPureState :: (PureState -> PureState) -> PandocPure () +modifyPureState f = PandocPure $ lift $ lift $ modify f + + +data FileInfo = FileInfo { infoFileMTime :: UTCTime + , infoFileContents :: B.ByteString + } + +newtype FileTree = FileTree {unFileTree :: M.Map FilePath FileInfo} + deriving (Monoid) + +getFileInfo :: FilePath -> FileTree -> Maybe FileInfo +getFileInfo fp tree = M.lookup fp $ unFileTree tree + + +newtype PandocPure a = PandocPure { + unPandocPure :: ExceptT PandocError + (StateT CommonState (State PureState)) a + } deriving ( Functor + , Applicative + , Monad + , MonadError PandocError + ) + +runPure :: PandocPure a -> Either PandocError a +runPure x = flip evalState def $ + flip evalStateT def $ + runExceptT $ + unPandocPure x + +instance PandocMonad PandocPure where + lookupEnv s = do + env <- getsPureState stEnv + return (lookup s env) + + getCurrentTime = getsPureState stTime + + getCurrentTimeZone = getsPureState stTimeZone + + newStdGen = do + g <- getsPureState stStdGen + let (_, nxtGen) = next g + modifyPureState $ \st -> st { stStdGen = nxtGen } + return g + + newUniqueHash = do + uniqs <- getsPureState stUniqStore + case uniqs of + u : us -> do + modifyPureState $ \st -> st { stUniqStore = us } + return u + _ -> M.fail "uniq store ran out of elements" + openURL _ = throwError $ PandocSomeError "Cannot open URL in PandocPure" + readFileLazy fp = do + fps <- getsPureState stFiles + case infoFileContents <$> getFileInfo fp fps of + Just bs -> return (BL.fromStrict bs) + Nothing -> throwError $ PandocFileReadError fp + readFileStrict fp = do + fps <- getsPureState stFiles + case infoFileContents <$> getFileInfo fp fps of + Just bs -> return bs + Nothing -> throwError $ PandocFileReadError fp + readDataFile Nothing "reference.docx" = do + (B.concat . BL.toChunks . fromArchive) <$> getsPureState stReferenceDocx + readDataFile Nothing "reference.odt" = do + (B.concat . BL.toChunks . fromArchive) <$> getsPureState stReferenceODT + readDataFile Nothing fname = do + let fname' = if fname == "MANUAL.txt" then fname else "data" </> fname + readFileStrict fname' + readDataFile (Just userDir) fname = do + userDirFiles <- getsPureState stUserDataDir + case infoFileContents <$> (getFileInfo (userDir </> fname) userDirFiles) of + Just bs -> return bs + Nothing -> readDataFile Nothing fname + + glob s = do + fontFiles <- getsPureState stFontFiles + return (filter (match (compile s)) fontFiles) + + getModificationTime fp = do + fps <- getsPureState stFiles + case infoFileMTime <$> (getFileInfo fp fps) of + Just tm -> return tm + Nothing -> throwError $ PandocFileReadError fp + + getCommonState = PandocPure $ lift $ get + putCommonState x = PandocPure $ lift $ put x + + logOutput _msg = return () + +instance PandocMonad m => PandocMonad (ParsecT s st m) where + lookupEnv = lift . lookupEnv + getCurrentTime = lift getCurrentTime + getCurrentTimeZone = lift getCurrentTimeZone + newStdGen = lift newStdGen + newUniqueHash = lift newUniqueHash + openURL = lift . openURL + readFileLazy = lift . readFileLazy + readFileStrict = lift . readFileStrict + readDataFile mbuserdir = lift . readDataFile mbuserdir + glob = lift . glob + getModificationTime = lift . getModificationTime + getCommonState = lift getCommonState + putCommonState = lift . putCommonState + logOutput = lift . logOutput + +instance PandocMonad m => PandocMonad (ReaderT r m) where + lookupEnv = lift . lookupEnv + getCurrentTime = lift getCurrentTime + getCurrentTimeZone = lift getCurrentTimeZone + newStdGen = lift newStdGen + newUniqueHash = lift newUniqueHash + openURL = lift . openURL + readFileLazy = lift . readFileLazy + readFileStrict = lift . readFileStrict + readDataFile mbuserdir = lift . readDataFile mbuserdir + glob = lift . glob + getModificationTime = lift . getModificationTime + getCommonState = lift getCommonState + putCommonState = lift . putCommonState + logOutput = lift . logOutput + +instance (PandocMonad m, Monoid w) => PandocMonad (WriterT w m) where + lookupEnv = lift . lookupEnv + getCurrentTime = lift getCurrentTime + getCurrentTimeZone = lift getCurrentTimeZone + newStdGen = lift newStdGen + newUniqueHash = lift newUniqueHash + openURL = lift . openURL + readFileLazy = lift . readFileLazy + readFileStrict = lift . readFileStrict + readDataFile mbuserdir = lift . readDataFile mbuserdir + glob = lift . glob + getModificationTime = lift . getModificationTime + getCommonState = lift getCommonState + putCommonState = lift . putCommonState + logOutput = lift . logOutput + +instance (PandocMonad m, Monoid w) => PandocMonad (RWST r w st m) where + lookupEnv = lift . lookupEnv + getCurrentTime = lift getCurrentTime + getCurrentTimeZone = lift getCurrentTimeZone + newStdGen = lift newStdGen + newUniqueHash = lift newUniqueHash + openURL = lift . openURL + readFileLazy = lift . readFileLazy + readFileStrict = lift . readFileStrict + readDataFile mbuserdir = lift . readDataFile mbuserdir + glob = lift . glob + getModificationTime = lift . getModificationTime + getCommonState = lift getCommonState + putCommonState = lift . putCommonState + logOutput = lift . logOutput + +instance PandocMonad m => PandocMonad (StateT st m) where + lookupEnv = lift . lookupEnv + getCurrentTime = lift getCurrentTime + getCurrentTimeZone = lift getCurrentTimeZone + newStdGen = lift newStdGen + newUniqueHash = lift newUniqueHash + openURL = lift . openURL + readFileLazy = lift . readFileLazy + readFileStrict = lift . readFileStrict + readDataFile mbuserdir = lift . readDataFile mbuserdir + glob = lift . glob + getModificationTime = lift . getModificationTime + getCommonState = lift getCommonState + putCommonState = lift . putCommonState + logOutput = lift . logOutput + diff --git a/src/Text/Pandoc/Compat/Time.hs b/src/Text/Pandoc/Compat/Time.hs new file mode 100644 index 000000000..b1cde82a4 --- /dev/null +++ b/src/Text/Pandoc/Compat/Time.hs @@ -0,0 +1,30 @@ +{-# LANGUAGE CPP #-} + +{- +This compatibility module is needed because, in time 1.5, the +`defaultTimeLocale` function was moved from System.Locale (in the +old-locale library) into Data.Time. + +We support both behaviors because time 1.4 is a boot library for GHC +7.8. time 1.5 is a boot library for GHC 7.10. + +When support is dropped for GHC 7.8, this module may be obsoleted. +-} + +#if MIN_VERSION_time(1,5,0) +module Text.Pandoc.Compat.Time ( + module Data.Time +) +where +import Data.Time + +#else +module Text.Pandoc.Compat.Time ( + module Data.Time, + defaultTimeLocale +) +where +import Data.Time +import System.Locale ( defaultTimeLocale ) + +#endif diff --git a/src/Text/Pandoc/Data.hsb b/src/Text/Pandoc/Data.hsb new file mode 100644 index 000000000..8786647c5 --- /dev/null +++ b/src/Text/Pandoc/Data.hsb @@ -0,0 +1,15 @@ +{-# LANGUAGE OverloadedStrings #-} +-- to be processed using hsb2hs +module Text.Pandoc.Data (dataFiles) where +import qualified Data.ByteString as B +import System.FilePath (splitDirectories) +import qualified System.FilePath.Posix as Posix + +-- We ensure that the data files are stored using Posix +-- path separators (/), even on Windows. +dataFiles :: [(FilePath, B.ByteString)] +dataFiles = map (\(fp, contents) -> + (Posix.joinPath (splitDirectories fp), contents)) dataFiles' + +dataFiles' :: [(FilePath, B.ByteString)] +dataFiles' = ("MANUAL.txt", %blob "MANUAL.txt") : %blobs "data" diff --git a/src/Text/Pandoc/Emoji.hs b/src/Text/Pandoc/Emoji.hs new file mode 100644 index 000000000..c9f368abc --- /dev/null +++ b/src/Text/Pandoc/Emoji.hs @@ -0,0 +1,906 @@ +{- +Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Emoji + Copyright : Copyright (C) 2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Emoji symbol lookup from canonical string identifier. +-} +module Text.Pandoc.Emoji ( emojis ) where +import qualified Data.Map as M + +emojis :: M.Map String String +emojis = M.fromList + [("+1","\128077") + ,("-1","\128078") + ,("100","\128175") + ,("1234","\128290") + ,("8ball","\127921") + ,("a","\127344\65039") + ,("ab","\127374") + ,("abc","\128292") + ,("abcd","\128289") + ,("accept","\127569") + ,("aerial_tramway","\128673") + ,("airplane","\9992\65039") + ,("alarm_clock","\9200") + ,("alien","\128125") + ,("ambulance","\128657") + ,("anchor","\9875") + ,("angel","\128124") + ,("anger","\128162") + ,("angry","\128544") + ,("anguished","\128551") + ,("ant","\128028") + ,("apple","\127822") + ,("aquarius","\9810") + ,("aries","\9800") + ,("arrow_backward","\9664\65039") + ,("arrow_double_down","\9196") + ,("arrow_double_up","\9195") + ,("arrow_down","\11015\65039") + ,("arrow_down_small","\128317") + ,("arrow_forward","\9654\65039") + ,("arrow_heading_down","\10549\65039") + ,("arrow_heading_up","\10548\65039") + ,("arrow_left","\11013\65039") + ,("arrow_lower_left","\8601\65039") + ,("arrow_lower_right","\8600\65039") + ,("arrow_right","\10145\65039") + ,("arrow_right_hook","\8618\65039") + ,("arrow_up","\11014\65039") + ,("arrow_up_down","\8597\65039") + ,("arrow_up_small","\128316") + ,("arrow_upper_left","\8598\65039") + ,("arrow_upper_right","\8599\65039") + ,("arrows_clockwise","\128259") + ,("arrows_counterclockwise","\128260") + ,("art","\127912") + ,("articulated_lorry","\128667") + ,("astonished","\128562") + ,("athletic_shoe","\128095") + ,("atm","\127975") + ,("b","\127345\65039") + ,("baby","\128118") + ,("baby_bottle","\127868") + ,("baby_chick","\128036") + ,("baby_symbol","\128700") + ,("back","\128281") + ,("baggage_claim","\128708") + ,("balloon","\127880") + ,("ballot_box_with_check","\9745\65039") + ,("bamboo","\127885") + ,("banana","\127820") + ,("bangbang","\8252\65039") + ,("bank","\127974") + ,("bar_chart","\128202") + ,("barber","\128136") + ,("baseball","\9918\65039") + ,("basketball","\127936") + ,("bath","\128704") + ,("bathtub","\128705") + ,("battery","\128267") + ,("bear","\128059") + ,("bee","\128029") + ,("beer","\127866") + ,("beers","\127867") + ,("beetle","\128030") + ,("beginner","\128304") + ,("bell","\128276") + ,("bento","\127857") + ,("bicyclist","\128692") + ,("bike","\128690") + ,("bikini","\128089") + ,("bird","\128038") + ,("birthday","\127874") + ,("black_circle","\9899") + ,("black_joker","\127183") + ,("black_large_square","\11035") + ,("black_medium_small_square","\9726") + ,("black_medium_square","\9724\65039") + ,("black_nib","\10002\65039") + ,("black_small_square","\9642\65039") + ,("black_square_button","\128306") + ,("blossom","\127804") + ,("blowfish","\128033") + ,("blue_book","\128216") + ,("blue_car","\128665") + ,("blue_heart","\128153") + ,("blush","\128522") + ,("boar","\128023") + ,("boat","\9973") + ,("bomb","\128163") + ,("book","\128214") + ,("bookmark","\128278") + ,("bookmark_tabs","\128209") + ,("books","\128218") + ,("boom","\128165") + ,("boot","\128098") + ,("bouquet","\128144") + ,("bow","\128583") + ,("bowling","\127923") + ,("boy","\128102") + ,("bread","\127838") + ,("bride_with_veil","\128112") + ,("bridge_at_night","\127753") + ,("briefcase","\128188") + ,("broken_heart","\128148") + ,("bug","\128027") + ,("bulb","\128161") + ,("bullettrain_front","\128645") + ,("bullettrain_side","\128644") + ,("bus","\128652") + ,("busstop","\128655") + ,("bust_in_silhouette","\128100") + ,("busts_in_silhouette","\128101") + ,("cactus","\127797") + ,("cake","\127856") + ,("calendar","\128198") + ,("calling","\128242") + ,("camel","\128043") + ,("camera","\128247") + ,("cancer","\9803") + ,("candy","\127852") + ,("capital_abcd","\128288") + ,("capricorn","\9809") + ,("car","\128663") + ,("card_index","\128199") + ,("carousel_horse","\127904") + ,("cat","\128049") + ,("cat2","\128008") + ,("cd","\128191") + ,("chart","\128185") + ,("chart_with_downwards_trend","\128201") + ,("chart_with_upwards_trend","\128200") + ,("checkered_flag","\127937") + ,("cherries","\127826") + ,("cherry_blossom","\127800") + ,("chestnut","\127792") + ,("chicken","\128020") + ,("children_crossing","\128696") + ,("chocolate_bar","\127851") + ,("christmas_tree","\127876") + ,("church","\9962") + ,("cinema","\127910") + ,("circus_tent","\127914") + ,("city_sunrise","\127751") + ,("city_sunset","\127750") + ,("cl","\127377") + ,("clap","\128079") + ,("clapper","\127916") + ,("clipboard","\128203") + ,("clock1","\128336") + ,("clock10","\128345") + ,("clock1030","\128357") + ,("clock11","\128346") + ,("clock1130","\128358") + ,("clock12","\128347") + ,("clock1230","\128359") + ,("clock130","\128348") + ,("clock2","\128337") + ,("clock230","\128349") + ,("clock3","\128338") + ,("clock330","\128350") + ,("clock4","\128339") + ,("clock430","\128351") + ,("clock5","\128340") + ,("clock530","\128352") + ,("clock6","\128341") + ,("clock630","\128353") + ,("clock7","\128342") + ,("clock730","\128354") + ,("clock8","\128343") + ,("clock830","\128355") + ,("clock9","\128344") + ,("clock930","\128356") + ,("closed_book","\128213") + ,("closed_lock_with_key","\128272") + ,("closed_umbrella","\127746") + ,("cloud","\9729\65039") + ,("clubs","\9827\65039") + ,("cn","\127464\127475") + ,("cocktail","\127864") + ,("coffee","\9749") + ,("cold_sweat","\128560") + ,("collision","\128165") + ,("computer","\128187") + ,("confetti_ball","\127882") + ,("confounded","\128534") + ,("confused","\128533") + ,("congratulations","\12951\65039") + ,("construction","\128679") + ,("construction_worker","\128119") + ,("convenience_store","\127978") + ,("cookie","\127850") + ,("cool","\127378") + ,("cop","\128110") + ,("copyright","\169\65039") + ,("corn","\127805") + ,("couple","\128107") + ,("couple_with_heart","\128145") + ,("couplekiss","\128143") + ,("cow","\128046") + ,("cow2","\128004") + ,("credit_card","\128179") + ,("crescent_moon","\127769") + ,("crocodile","\128010") + ,("crossed_flags","\127884") + ,("crown","\128081") + ,("cry","\128546") + ,("crying_cat_face","\128575") + ,("crystal_ball","\128302") + ,("cupid","\128152") + ,("curly_loop","\10160") + ,("currency_exchange","\128177") + ,("curry","\127835") + ,("custard","\127854") + ,("customs","\128707") + ,("cyclone","\127744") + ,("dancer","\128131") + ,("dancers","\128111") + ,("dango","\127841") + ,("dart","\127919") + ,("dash","\128168") + ,("date","\128197") + ,("de","\127465\127466") + ,("deciduous_tree","\127795") + ,("department_store","\127980") + ,("diamond_shape_with_a_dot_inside","\128160") + ,("diamonds","\9830\65039") + ,("disappointed","\128542") + ,("disappointed_relieved","\128549") + ,("dizzy","\128171") + ,("dizzy_face","\128565") + ,("do_not_litter","\128687") + ,("dog","\128054") + ,("dog2","\128021") + ,("dollar","\128181") + ,("dolls","\127886") + ,("dolphin","\128044") + ,("door","\128682") + ,("doughnut","\127849") + ,("dragon","\128009") + ,("dragon_face","\128050") + ,("dress","\128087") + ,("dromedary_camel","\128042") + ,("droplet","\128167") + ,("dvd","\128192") + ,("e-mail","\128231") + ,("ear","\128066") + ,("ear_of_rice","\127806") + ,("earth_africa","\127757") + ,("earth_americas","\127758") + ,("earth_asia","\127759") + ,("egg","\127859") + ,("eggplant","\127814") + ,("eight","8\65039\8419") + ,("eight_pointed_black_star","\10036\65039") + ,("eight_spoked_asterisk","\10035\65039") + ,("electric_plug","\128268") + ,("elephant","\128024") + ,("email","\9993\65039") + ,("end","\128282") + ,("envelope","\9993\65039") + ,("envelope_with_arrow","\128233") + ,("es","\127466\127480") + ,("euro","\128182") + ,("european_castle","\127984") + ,("european_post_office","\127972") + ,("evergreen_tree","\127794") + ,("exclamation","\10071") + ,("expressionless","\128529") + ,("eyeglasses","\128083") + ,("eyes","\128064") + ,("facepunch","\128074") + ,("factory","\127981") + ,("fallen_leaf","\127810") + ,("family","\128106") + ,("fast_forward","\9193") + ,("fax","\128224") + ,("fearful","\128552") + ,("feet","\128062") + ,("ferris_wheel","\127905") + ,("file_folder","\128193") + ,("fire","\128293") + ,("fire_engine","\128658") + ,("fireworks","\127878") + ,("first_quarter_moon","\127763") + ,("first_quarter_moon_with_face","\127771") + ,("fish","\128031") + ,("fish_cake","\127845") + ,("fishing_pole_and_fish","\127907") + ,("fist","\9994") + ,("five","5\65039\8419") + ,("flags","\127887") + ,("flashlight","\128294") + ,("flipper","\128044") + ,("floppy_disk","\128190") + ,("flower_playing_cards","\127924") + ,("flushed","\128563") + ,("foggy","\127745") + ,("football","\127944") + ,("footprints","\128099") + ,("fork_and_knife","\127860") + ,("fountain","\9970") + ,("four","4\65039\8419") + ,("four_leaf_clover","\127808") + ,("fr","\127467\127479") + ,("free","\127379") + ,("fried_shrimp","\127844") + ,("fries","\127839") + ,("frog","\128056") + ,("frowning","\128550") + ,("fuelpump","\9981") + ,("full_moon","\127765") + ,("full_moon_with_face","\127773") + ,("game_die","\127922") + ,("gb","\127468\127463") + ,("gem","\128142") + ,("gemini","\9802") + ,("ghost","\128123") + ,("gift","\127873") + ,("gift_heart","\128157") + ,("girl","\128103") + ,("globe_with_meridians","\127760") + ,("goat","\128016") + ,("golf","\9971") + ,("grapes","\127815") + ,("green_apple","\127823") + ,("green_book","\128215") + ,("green_heart","\128154") + ,("grey_exclamation","\10069") + ,("grey_question","\10068") + ,("grimacing","\128556") + ,("grin","\128513") + ,("grinning","\128512") + ,("guardsman","\128130") + ,("guitar","\127928") + ,("gun","\128299") + ,("haircut","\128135") + ,("hamburger","\127828") + ,("hammer","\128296") + ,("hamster","\128057") + ,("hand","\9995") + ,("handbag","\128092") + ,("hankey","\128169") + ,("hash","#\65039\8419") + ,("hatched_chick","\128037") + ,("hatching_chick","\128035") + ,("headphones","\127911") + ,("hear_no_evil","\128585") + ,("heart","\10084\65039") + ,("heart_decoration","\128159") + ,("heart_eyes","\128525") + ,("heart_eyes_cat","\128571") + ,("heartbeat","\128147") + ,("heartpulse","\128151") + ,("hearts","\9829\65039") + ,("heavy_check_mark","\10004\65039") + ,("heavy_division_sign","\10135") + ,("heavy_dollar_sign","\128178") + ,("heavy_exclamation_mark","\10071") + ,("heavy_minus_sign","\10134") + ,("heavy_multiplication_x","\10006\65039") + ,("heavy_plus_sign","\10133") + ,("helicopter","\128641") + ,("herb","\127807") + ,("hibiscus","\127802") + ,("high_brightness","\128262") + ,("high_heel","\128096") + ,("hocho","\128298") + ,("honey_pot","\127855") + ,("honeybee","\128029") + ,("horse","\128052") + ,("horse_racing","\127943") + ,("hospital","\127973") + ,("hotel","\127976") + ,("hotsprings","\9832\65039") + ,("hourglass","\8987") + ,("hourglass_flowing_sand","\9203") + ,("house","\127968") + ,("house_with_garden","\127969") + ,("hushed","\128559") + ,("ice_cream","\127848") + ,("icecream","\127846") + ,("id","\127380") + ,("ideograph_advantage","\127568") + ,("imp","\128127") + ,("inbox_tray","\128229") + ,("incoming_envelope","\128232") + ,("information_desk_person","\128129") + ,("information_source","\8505\65039") + ,("innocent","\128519") + ,("interrobang","\8265\65039") + ,("iphone","\128241") + ,("it","\127470\127481") + ,("izakaya_lantern","\127982") + ,("jack_o_lantern","\127875") + ,("japan","\128510") + ,("japanese_castle","\127983") + ,("japanese_goblin","\128122") + ,("japanese_ogre","\128121") + ,("jeans","\128086") + ,("joy","\128514") + ,("joy_cat","\128569") + ,("jp","\127471\127477") + ,("key","\128273") + ,("keycap_ten","\128287") + ,("kimono","\128088") + ,("kiss","\128139") + ,("kissing","\128535") + ,("kissing_cat","\128573") + ,("kissing_closed_eyes","\128538") + ,("kissing_heart","\128536") + ,("kissing_smiling_eyes","\128537") + ,("knife","\128298") + ,("koala","\128040") + ,("koko","\127489") + ,("kr","\127472\127479") + ,("lantern","\127982") + ,("large_blue_circle","\128309") + ,("large_blue_diamond","\128311") + ,("large_orange_diamond","\128310") + ,("last_quarter_moon","\127767") + ,("last_quarter_moon_with_face","\127772") + ,("laughing","\128518") + ,("leaves","\127811") + ,("ledger","\128210") + ,("left_luggage","\128709") + ,("left_right_arrow","\8596\65039") + ,("leftwards_arrow_with_hook","\8617\65039") + ,("lemon","\127819") + ,("leo","\9804") + ,("leopard","\128006") + ,("libra","\9806") + ,("light_rail","\128648") + ,("link","\128279") + ,("lips","\128068") + ,("lipstick","\128132") + ,("lock","\128274") + ,("lock_with_ink_pen","\128271") + ,("lollipop","\127853") + ,("loop","\10175") + ,("loud_sound","\128266") + ,("loudspeaker","\128226") + ,("love_hotel","\127977") + ,("love_letter","\128140") + ,("low_brightness","\128261") + ,("m","\9410\65039") + ,("mag","\128269") + ,("mag_right","\128270") + ,("mahjong","\126980") + ,("mailbox","\128235") + ,("mailbox_closed","\128234") + ,("mailbox_with_mail","\128236") + ,("mailbox_with_no_mail","\128237") + ,("man","\128104") + ,("man_with_gua_pi_mao","\128114") + ,("man_with_turban","\128115") + ,("mans_shoe","\128094") + ,("maple_leaf","\127809") + ,("mask","\128567") + ,("massage","\128134") + ,("meat_on_bone","\127830") + ,("mega","\128227") + ,("melon","\127816") + ,("memo","\128221") + ,("mens","\128697") + ,("metro","\128647") + ,("microphone","\127908") + ,("microscope","\128300") + ,("milky_way","\127756") + ,("minibus","\128656") + ,("minidisc","\128189") + ,("mobile_phone_off","\128244") + ,("money_with_wings","\128184") + ,("moneybag","\128176") + ,("monkey","\128018") + ,("monkey_face","\128053") + ,("monorail","\128669") + ,("moon","\127764") + ,("mortar_board","\127891") + ,("mount_fuji","\128507") + ,("mountain_bicyclist","\128693") + ,("mountain_cableway","\128672") + ,("mountain_railway","\128670") + ,("mouse","\128045") + ,("mouse2","\128001") + ,("movie_camera","\127909") + ,("moyai","\128511") + ,("muscle","\128170") + ,("mushroom","\127812") + ,("musical_keyboard","\127929") + ,("musical_note","\127925") + ,("musical_score","\127932") + ,("mute","\128263") + ,("nail_care","\128133") + ,("name_badge","\128219") + ,("necktie","\128084") + ,("negative_squared_cross_mark","\10062") + ,("neutral_face","\128528") + ,("new","\127381") + ,("new_moon","\127761") + ,("new_moon_with_face","\127770") + ,("newspaper","\128240") + ,("ng","\127382") + ,("night_with_stars","\127747") + ,("nine","9\65039\8419") + ,("no_bell","\128277") + ,("no_bicycles","\128691") + ,("no_entry","\9940") + ,("no_entry_sign","\128683") + ,("no_good","\128581") + ,("no_mobile_phones","\128245") + ,("no_mouth","\128566") + ,("no_pedestrians","\128695") + ,("no_smoking","\128685") + ,("non-potable_water","\128689") + ,("nose","\128067") + ,("notebook","\128211") + ,("notebook_with_decorative_cover","\128212") + ,("notes","\127926") + ,("nut_and_bolt","\128297") + ,("o","\11093") + ,("o2","\127358\65039") + ,("ocean","\127754") + ,("octopus","\128025") + ,("oden","\127842") + ,("office","\127970") + ,("ok","\127383") + ,("ok_hand","\128076") + ,("ok_woman","\128582") + ,("older_man","\128116") + ,("older_woman","\128117") + ,("on","\128283") + ,("oncoming_automobile","\128664") + ,("oncoming_bus","\128653") + ,("oncoming_police_car","\128660") + ,("oncoming_taxi","\128662") + ,("one","1\65039\8419") + ,("open_book","\128214") + ,("open_file_folder","\128194") + ,("open_hands","\128080") + ,("open_mouth","\128558") + ,("ophiuchus","\9934") + ,("orange_book","\128217") + ,("outbox_tray","\128228") + ,("ox","\128002") + ,("package","\128230") + ,("page_facing_up","\128196") + ,("page_with_curl","\128195") + ,("pager","\128223") + ,("palm_tree","\127796") + ,("panda_face","\128060") + ,("paperclip","\128206") + ,("parking","\127359\65039") + ,("part_alternation_mark","\12349\65039") + ,("partly_sunny","\9925") + ,("passport_control","\128706") + ,("paw_prints","\128062") + ,("peach","\127825") + ,("pear","\127824") + ,("pencil","\128221") + ,("pencil2","\9999\65039") + ,("penguin","\128039") + ,("pensive","\128532") + ,("performing_arts","\127917") + ,("persevere","\128547") + ,("person_frowning","\128589") + ,("person_with_blond_hair","\128113") + ,("person_with_pouting_face","\128590") + ,("phone","\9742\65039") + ,("pig","\128055") + ,("pig2","\128022") + ,("pig_nose","\128061") + ,("pill","\128138") + ,("pineapple","\127821") + ,("pisces","\9811") + ,("pizza","\127829") + ,("point_down","\128071") + ,("point_left","\128072") + ,("point_right","\128073") + ,("point_up","\9757\65039") + ,("point_up_2","\128070") + ,("police_car","\128659") + ,("poodle","\128041") + ,("poop","\128169") + ,("post_office","\127971") + ,("postal_horn","\128239") + ,("postbox","\128238") + ,("potable_water","\128688") + ,("pouch","\128093") + ,("poultry_leg","\127831") + ,("pound","\128183") + ,("pouting_cat","\128574") + ,("pray","\128591") + ,("princess","\128120") + ,("punch","\128074") + ,("purple_heart","\128156") + ,("purse","\128091") + ,("pushpin","\128204") + ,("put_litter_in_its_place","\128686") + ,("question","\10067") + ,("rabbit","\128048") + ,("rabbit2","\128007") + ,("racehorse","\128014") + ,("radio","\128251") + ,("radio_button","\128280") + ,("rage","\128545") + ,("railway_car","\128643") + ,("rainbow","\127752") + ,("raised_hand","\9995") + ,("raised_hands","\128588") + ,("raising_hand","\128587") + ,("ram","\128015") + ,("ramen","\127836") + ,("rat","\128000") + ,("recycle","\9851\65039") + ,("red_car","\128663") + ,("red_circle","\128308") + ,("registered","\174\65039") + ,("relaxed","\9786\65039") + ,("relieved","\128524") + ,("repeat","\128257") + ,("repeat_one","\128258") + ,("restroom","\128699") + ,("revolving_hearts","\128158") + ,("rewind","\9194") + ,("ribbon","\127872") + ,("rice","\127834") + ,("rice_ball","\127833") + ,("rice_cracker","\127832") + ,("rice_scene","\127889") + ,("ring","\128141") + ,("rocket","\128640") + ,("roller_coaster","\127906") + ,("rooster","\128019") + ,("rose","\127801") + ,("rotating_light","\128680") + ,("round_pushpin","\128205") + ,("rowboat","\128675") + ,("ru","\127479\127482") + ,("rugby_football","\127945") + ,("runner","\127939") + ,("running","\127939") + ,("running_shirt_with_sash","\127933") + ,("sa","\127490\65039") + ,("sagittarius","\9808") + ,("sailboat","\9973") + ,("sake","\127862") + ,("sandal","\128097") + ,("santa","\127877") + ,("satellite","\128225") + ,("satisfied","\128518") + ,("saxophone","\127927") + ,("school","\127979") + ,("school_satchel","\127890") + ,("scissors","\9986\65039") + ,("scorpius","\9807") + ,("scream","\128561") + ,("scream_cat","\128576") + ,("scroll","\128220") + ,("seat","\128186") + ,("secret","\12953\65039") + ,("see_no_evil","\128584") + ,("seedling","\127793") + ,("seven","7\65039\8419") + ,("shaved_ice","\127847") + ,("sheep","\128017") + ,("shell","\128026") + ,("ship","\128674") + ,("shirt","\128085") + ,("shit","\128169") + ,("shoe","\128094") + ,("shower","\128703") + ,("signal_strength","\128246") + ,("six","6\65039\8419") + ,("six_pointed_star","\128303") + ,("ski","\127935") + ,("skull","\128128") + ,("sleeping","\128564") + ,("sleepy","\128554") + ,("slot_machine","\127920") + ,("small_blue_diamond","\128313") + ,("small_orange_diamond","\128312") + ,("small_red_triangle","\128314") + ,("small_red_triangle_down","\128315") + ,("smile","\128516") + ,("smile_cat","\128568") + ,("smiley","\128515") + ,("smiley_cat","\128570") + ,("smiling_imp","\128520") + ,("smirk","\128527") + ,("smirk_cat","\128572") + ,("smoking","\128684") + ,("snail","\128012") + ,("snake","\128013") + ,("snowboarder","\127938") + ,("snowflake","\10052\65039") + ,("snowman","\9924") + ,("sob","\128557") + ,("soccer","\9917") + ,("soon","\128284") + ,("sos","\127384") + ,("sound","\128265") + ,("space_invader","\128126") + ,("spades","\9824\65039") + ,("spaghetti","\127837") + ,("sparkle","\10055\65039") + ,("sparkler","\127879") + ,("sparkles","\10024") + ,("sparkling_heart","\128150") + ,("speak_no_evil","\128586") + ,("speaker","\128264") + ,("speech_balloon","\128172") + ,("speedboat","\128676") + ,("star","\11088") + ,("star2","\127775") + ,("stars","\127776") + ,("station","\128649") + ,("statue_of_liberty","\128509") + ,("steam_locomotive","\128642") + ,("stew","\127858") + ,("straight_ruler","\128207") + ,("strawberry","\127827") + ,("stuck_out_tongue","\128539") + ,("stuck_out_tongue_closed_eyes","\128541") + ,("stuck_out_tongue_winking_eye","\128540") + ,("sun_with_face","\127774") + ,("sunflower","\127803") + ,("sunglasses","\128526") + ,("sunny","\9728\65039") + ,("sunrise","\127749") + ,("sunrise_over_mountains","\127748") + ,("surfer","\127940") + ,("sushi","\127843") + ,("suspension_railway","\128671") + ,("sweat","\128531") + ,("sweat_drops","\128166") + ,("sweat_smile","\128517") + ,("sweet_potato","\127840") + ,("swimmer","\127946") + ,("symbols","\128291") + ,("syringe","\128137") + ,("tada","\127881") + ,("tanabata_tree","\127883") + ,("tangerine","\127818") + ,("taurus","\9801") + ,("taxi","\128661") + ,("tea","\127861") + ,("telephone","\9742\65039") + ,("telephone_receiver","\128222") + ,("telescope","\128301") + ,("tennis","\127934") + ,("tent","\9978") + ,("thought_balloon","\128173") + ,("three","3\65039\8419") + ,("thumbsdown","\128078") + ,("thumbsup","\128077") + ,("ticket","\127915") + ,("tiger","\128047") + ,("tiger2","\128005") + ,("tired_face","\128555") + ,("tm","\8482\65039") + ,("toilet","\128701") + ,("tokyo_tower","\128508") + ,("tomato","\127813") + ,("tongue","\128069") + ,("top","\128285") + ,("tophat","\127913") + ,("tractor","\128668") + ,("traffic_light","\128677") + ,("train","\128651") + ,("train2","\128646") + ,("tram","\128650") + ,("triangular_flag_on_post","\128681") + ,("triangular_ruler","\128208") + ,("trident","\128305") + ,("triumph","\128548") + ,("trolleybus","\128654") + ,("trophy","\127942") + ,("tropical_drink","\127865") + ,("tropical_fish","\128032") + ,("truck","\128666") + ,("trumpet","\127930") + ,("tshirt","\128085") + ,("tulip","\127799") + ,("turtle","\128034") + ,("tv","\128250") + ,("twisted_rightwards_arrows","\128256") + ,("two","2\65039\8419") + ,("two_hearts","\128149") + ,("two_men_holding_hands","\128108") + ,("two_women_holding_hands","\128109") + ,("u5272","\127545") + ,("u5408","\127540") + ,("u55b6","\127546") + ,("u6307","\127535") + ,("u6708","\127543\65039") + ,("u6709","\127542") + ,("u6e80","\127541") + ,("u7121","\127514") + ,("u7533","\127544") + ,("u7981","\127538") + ,("u7a7a","\127539") + ,("uk","\127468\127463") + ,("umbrella","\9748") + ,("unamused","\128530") + ,("underage","\128286") + ,("unlock","\128275") + ,("up","\127385") + ,("us","\127482\127480") + ,("v","\9996\65039") + ,("vertical_traffic_light","\128678") + ,("vhs","\128252") + ,("vibration_mode","\128243") + ,("video_camera","\128249") + ,("video_game","\127918") + ,("violin","\127931") + ,("virgo","\9805") + ,("volcano","\127755") + ,("vs","\127386") + ,("walking","\128694") + ,("waning_crescent_moon","\127768") + ,("waning_gibbous_moon","\127766") + ,("warning","\9888\65039") + ,("watch","\8986") + ,("water_buffalo","\128003") + ,("watermelon","\127817") + ,("wave","\128075") + ,("wavy_dash","\12336\65039") + ,("waxing_crescent_moon","\127762") + ,("waxing_gibbous_moon","\127764") + ,("wc","\128702") + ,("weary","\128553") + ,("wedding","\128146") + ,("whale","\128051") + ,("whale2","\128011") + ,("wheelchair","\9855") + ,("white_check_mark","\9989") + ,("white_circle","\9898") + ,("white_flower","\128174") + ,("white_large_square","\11036") + ,("white_medium_small_square","\9725") + ,("white_medium_square","\9723\65039") + ,("white_small_square","\9643\65039") + ,("white_square_button","\128307") + ,("wind_chime","\127888") + ,("wine_glass","\127863") + ,("wink","\128521") + ,("wolf","\128058") + ,("woman","\128105") + ,("womans_clothes","\128090") + ,("womans_hat","\128082") + ,("womens","\128698") + ,("worried","\128543") + ,("wrench","\128295") + ,("x","\10060") + ,("yellow_heart","\128155") + ,("yen","\128180") + ,("yum","\128523") + ,("zap","\9889") + ,("zero","0\65039\8419") + ,("zzz","\128164") + ] + diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs new file mode 100644 index 000000000..65f912c88 --- /dev/null +++ b/src/Text/Pandoc/Error.hs @@ -0,0 +1,76 @@ +{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} +{- | + Module : Text.Pandoc.Error + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +This module provides a standard way to deal with possible errors encounted +during parsing. + +-} +module Text.Pandoc.Error ( + PandocError(..), + handleError) where + +import Text.Parsec.Error +import Text.Parsec.Pos hiding (Line) +import Data.Generics (Typeable) +import GHC.Generics (Generic) +import Control.Exception (Exception) +import Text.Pandoc.Shared (err) + +type Input = String + +data PandocError = PandocFileReadError FilePath + | PandocShouldNeverHappenError String + | PandocSomeError String + | PandocParseError String + | PandocParsecError Input ParseError + | PandocMakePDFError String + deriving (Show, Typeable, Generic) + +instance Exception PandocError + +-- | Handle PandocError by exiting with an error message. +handleError :: Either PandocError a -> IO a +handleError (Right r) = return r +handleError (Left e) = + case e of + PandocFileReadError fp -> err 61 $ "problem reading " ++ fp + PandocShouldNeverHappenError s -> err 62 s + PandocSomeError s -> err 63 s + PandocParseError s -> err 64 s + PandocParsecError input err' -> + let errPos = errorPos err' + errLine = sourceLine errPos + errColumn = sourceColumn errPos + ls = lines input ++ [""] + errorInFile = if length ls > errLine - 1 + then concat ["\n", (ls !! (errLine - 1)) + ,"\n", replicate (errColumn - 1) ' ' + ,"^"] + else "" + in err 65 $ "\nError at " ++ show err' ++ errorInFile + PandocMakePDFError s -> err 65 s + diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs new file mode 100644 index 000000000..d5e59e8e1 --- /dev/null +++ b/src/Text/Pandoc/Extensions.hs @@ -0,0 +1,267 @@ +{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} +{- +Copyright (C) 2012-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Extensions + Copyright : Copyright (C) 2012-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Data structures and functions for representing markup extensions. +-} +module Text.Pandoc.Extensions ( Extension(..) + , Extensions + , emptyExtensions + , extensionsFromList + , extensionEnabled + , enableExtension + , disableExtension + , pandocExtensions + , plainExtensions + , strictExtensions + , phpMarkdownExtraExtensions + , githubMarkdownExtensions + , multimarkdownExtensions ) +where +import Data.Bits (testBit, setBit, clearBit) +import Data.Data (Data) +import Data.Typeable (Typeable) +import GHC.Generics (Generic) + +newtype Extensions = Extensions Integer + deriving (Show, Read, Eq, Ord, Data, Typeable, Generic) + +extensionsFromList :: [Extension] -> Extensions +extensionsFromList = foldr enableExtension emptyExtensions + +emptyExtensions :: Extensions +emptyExtensions = Extensions 0 + +extensionEnabled :: Extension -> Extensions -> Bool +extensionEnabled x (Extensions exts) = testBit exts (fromEnum x) + +enableExtension :: Extension -> Extensions -> Extensions +enableExtension x (Extensions exts) = Extensions (setBit exts (fromEnum x)) + +disableExtension :: Extension -> Extensions -> Extensions +disableExtension x (Extensions exts) = Extensions (clearBit exts (fromEnum x)) + +-- | Individually selectable syntax extensions. +data Extension = + Ext_footnotes -- ^ Pandoc/PHP/MMD style footnotes + | Ext_inline_notes -- ^ Pandoc-style inline notes + | Ext_pandoc_title_block -- ^ Pandoc title block + | Ext_yaml_metadata_block -- ^ YAML metadata block + | Ext_mmd_title_block -- ^ Multimarkdown metadata block + | Ext_table_captions -- ^ Pandoc-style table captions + | Ext_implicit_figures -- ^ A paragraph with just an image is a figure + | Ext_simple_tables -- ^ Pandoc-style simple tables + | Ext_multiline_tables -- ^ Pandoc-style multiline tables + | Ext_grid_tables -- ^ Grid tables (pandoc, reST) + | Ext_pipe_tables -- ^ Pipe tables (as in PHP markdown extra) + | Ext_citations -- ^ Pandoc/citeproc citations + | Ext_raw_tex -- ^ Allow raw TeX (other than math) + | Ext_raw_html -- ^ Allow raw HTML + | Ext_tex_math_dollars -- ^ TeX math between $..$ or $$..$$ + | Ext_tex_math_single_backslash -- ^ TeX math btw \(..\) \[..\] + | Ext_tex_math_double_backslash -- ^ TeX math btw \\(..\\) \\[..\\] + | Ext_latex_macros -- ^ Parse LaTeX macro definitions (for math only) + | Ext_fenced_code_blocks -- ^ Parse fenced code blocks + | Ext_fenced_code_attributes -- ^ Allow attributes on fenced code blocks + | Ext_backtick_code_blocks -- ^ GitHub style ``` code blocks + | Ext_inline_code_attributes -- ^ Allow attributes on inline code + | Ext_markdown_in_html_blocks -- ^ Interpret as markdown inside HTML blocks + | Ext_native_divs -- ^ Use Div blocks for contents of <div> tags + | Ext_native_spans -- ^ Use Span inlines for contents of <span> + | Ext_bracketed_spans -- ^ Bracketed spans with attributes + | Ext_markdown_attribute -- ^ Interpret text inside HTML as markdown + -- iff container has attribute 'markdown' + | Ext_escaped_line_breaks -- ^ Treat a backslash at EOL as linebreak + | Ext_link_attributes -- ^ link and image attributes + | Ext_mmd_link_attributes -- ^ MMD style reference link attributes + | Ext_autolink_bare_uris -- ^ Make all absolute URIs into links + | Ext_fancy_lists -- ^ Enable fancy list numbers and delimiters + | Ext_lists_without_preceding_blankline -- ^ Allow lists without preceding blank + | Ext_startnum -- ^ Make start number of ordered list significant + | Ext_definition_lists -- ^ Definition lists as in pandoc, mmd, php + | Ext_compact_definition_lists -- ^ Definition lists without + -- space between items, and disallow laziness + | Ext_example_lists -- ^ Markdown-style numbered examples + | Ext_all_symbols_escapable -- ^ Make all non-alphanumerics escapable + | Ext_angle_brackets_escapable -- ^ Make < and > escapable + | Ext_intraword_underscores -- ^ Treat underscore inside word as literal + | Ext_blank_before_blockquote -- ^ Require blank line before a blockquote + | Ext_blank_before_header -- ^ Require blank line before a header + | Ext_strikeout -- ^ Strikeout using ~~this~~ syntax + | Ext_superscript -- ^ Superscript using ^this^ syntax + | Ext_subscript -- ^ Subscript using ~this~ syntax + | Ext_hard_line_breaks -- ^ All newlines become hard line breaks + | Ext_ignore_line_breaks -- ^ Newlines in paragraphs are ignored + | Ext_east_asian_line_breaks -- ^ Newlines in paragraphs are ignored between + -- East Asian wide characters + | Ext_literate_haskell -- ^ Enable literate Haskell conventions + | Ext_abbreviations -- ^ PHP markdown extra abbreviation definitions + | Ext_emoji -- ^ Support emoji like :smile: + | Ext_auto_identifiers -- ^ Automatic identifiers for headers + | Ext_ascii_identifiers -- ^ ascii-only identifiers for headers + | Ext_header_attributes -- ^ Explicit header attributes {#id .class k=v} + | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid] + | Ext_implicit_header_references -- ^ Implicit reference links for headers + | Ext_line_blocks -- ^ RST style line blocks + | Ext_epub_html_exts -- ^ Recognise the EPUB extended version of HTML + | Ext_shortcut_reference_links -- ^ Shortcut reference links + | Ext_smart -- ^ "Smart" quotes, apostrophes, ellipses, dashes + | Ext_old_dashes -- ^ -- = em, - before number = en + deriving (Show, Read, Enum, Eq, Ord, Bounded, Data, Typeable, Generic) + +pandocExtensions :: Extensions +pandocExtensions = extensionsFromList + [ Ext_footnotes + , Ext_inline_notes + , Ext_pandoc_title_block + , Ext_yaml_metadata_block + , Ext_table_captions + , Ext_implicit_figures + , Ext_simple_tables + , Ext_multiline_tables + , Ext_grid_tables + , Ext_pipe_tables + , Ext_citations + , Ext_raw_tex + , Ext_raw_html + , Ext_tex_math_dollars + , Ext_latex_macros + , Ext_fenced_code_blocks + , Ext_fenced_code_attributes + , Ext_backtick_code_blocks + , Ext_inline_code_attributes + , Ext_markdown_in_html_blocks + , Ext_native_divs + , Ext_native_spans + , Ext_bracketed_spans + , Ext_escaped_line_breaks + , Ext_fancy_lists + , Ext_startnum + , Ext_definition_lists + , Ext_example_lists + , Ext_all_symbols_escapable + , Ext_intraword_underscores + , Ext_blank_before_blockquote + , Ext_blank_before_header + , Ext_strikeout + , Ext_superscript + , Ext_subscript + , Ext_auto_identifiers + , Ext_header_attributes + , Ext_link_attributes + , Ext_implicit_header_references + , Ext_line_blocks + , Ext_shortcut_reference_links + , Ext_smart + ] + +plainExtensions :: Extensions +plainExtensions = extensionsFromList + [ Ext_table_captions + , Ext_implicit_figures + , Ext_simple_tables + , Ext_multiline_tables + , Ext_grid_tables + , Ext_latex_macros + , Ext_fancy_lists + , Ext_startnum + , Ext_definition_lists + , Ext_example_lists + , Ext_intraword_underscores + , Ext_blank_before_blockquote + , Ext_blank_before_header + , Ext_strikeout + ] + +phpMarkdownExtraExtensions :: Extensions +phpMarkdownExtraExtensions = extensionsFromList + [ Ext_footnotes + , Ext_pipe_tables + , Ext_raw_html + , Ext_markdown_attribute + , Ext_fenced_code_blocks + , Ext_definition_lists + , Ext_intraword_underscores + , Ext_header_attributes + , Ext_link_attributes + , Ext_abbreviations + , Ext_shortcut_reference_links + ] + +githubMarkdownExtensions :: Extensions +githubMarkdownExtensions = extensionsFromList + [ Ext_angle_brackets_escapable + , Ext_pipe_tables + , Ext_raw_html + , Ext_fenced_code_blocks + , Ext_auto_identifiers + , Ext_ascii_identifiers + , Ext_backtick_code_blocks + , Ext_autolink_bare_uris + , Ext_intraword_underscores + , Ext_strikeout + , Ext_hard_line_breaks + , Ext_emoji + , Ext_lists_without_preceding_blankline + , Ext_shortcut_reference_links + ] + +multimarkdownExtensions :: Extensions +multimarkdownExtensions = extensionsFromList + [ Ext_pipe_tables + , Ext_raw_html + , Ext_markdown_attribute + , Ext_mmd_link_attributes + -- , Ext_raw_tex + -- Note: MMD's raw TeX syntax requires raw TeX to be + -- enclosed in HTML comment + , Ext_tex_math_double_backslash + , Ext_intraword_underscores + , Ext_mmd_title_block + , Ext_footnotes + , Ext_definition_lists + , Ext_all_symbols_escapable + , Ext_implicit_header_references + , Ext_auto_identifiers + , Ext_mmd_header_identifiers + , Ext_implicit_figures + -- Note: MMD's syntax for superscripts and subscripts + -- is a bit more permissive than pandoc's, allowing + -- e^2 and a~1 instead of e^2^ and a~1~, so even with + -- these options we don't have full support for MMD + -- superscripts and subscripts, but there's no reason + -- not to include these: + , Ext_superscript + , Ext_subscript + ] + +strictExtensions :: Extensions +strictExtensions = extensionsFromList + [ Ext_raw_html + , Ext_shortcut_reference_links + ] + diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs new file mode 100644 index 000000000..df060915c --- /dev/null +++ b/src/Text/Pandoc/Highlighting.hs @@ -0,0 +1,223 @@ +{- +Copyright (C) 2008-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Highlighting + Copyright : Copyright (C) 2008-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Exports functions for syntax highlighting. +-} + +module Text.Pandoc.Highlighting ( highlightingStyles + , languages + , languagesByExtension + , highlight + , formatLaTeXInline + , formatLaTeXBlock + , styleToLaTeX + , formatHtmlInline + , formatHtmlBlock + , styleToCss + , pygments + , espresso + , zenburn + , tango + , kate + , monochrome + , haddock + , Style + , fromListingsLanguage + , toListingsLanguage + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared (safeRead) +import Skylighting +import Data.Maybe (fromMaybe) +import Data.Char (toLower) +import qualified Data.Map as M +import Control.Monad +import qualified Data.Text as T + +highlightingStyles :: [(String, Style)] +highlightingStyles = + [("pygments", pygments), + ("tango", tango), + ("espresso", espresso), + ("zenburn", zenburn), + ("kate", kate), + ("monochrome", monochrome), + ("breezedark", breezeDark), + ("haddock", haddock)] + +languages :: [String] +languages = [T.unpack (T.toLower (sName s)) | s <- M.elems defaultSyntaxMap] + +languagesByExtension :: String -> [String] +languagesByExtension ext = + [T.unpack (T.toLower (sName s)) | s <- syntaxesByExtension defaultSyntaxMap ext] + +highlight :: (FormatOptions -> [SourceLine] -> a) -- ^ Formatter + -> Attr -- ^ Attributes of the CodeBlock + -> String -- ^ Raw contents of the CodeBlock + -> Maybe a -- ^ Maybe the formatted result +highlight formatter (_, classes, keyvals) rawCode = + let firstNum = fromMaybe 1 (safeRead (fromMaybe "1" $ lookup "startFrom" keyvals)) + fmtOpts = defaultFormatOpts{ + startNumber = firstNum, + numberLines = any (`elem` + ["number","numberLines", "number-lines"]) classes } + tokenizeOpts = TokenizerConfig{ syntaxMap = defaultSyntaxMap + , traceOutput = False } + classes' = map T.pack classes + rawCode' = T.pack rawCode + in case msum (map (\l -> lookupSyntax l defaultSyntaxMap) classes') of + Nothing + | numberLines fmtOpts -> Just + $ formatter fmtOpts{ codeClasses = [], + containerClasses = classes' } + $ map (\ln -> [(NormalTok, ln)]) $ T.lines rawCode' + | otherwise -> Nothing + Just syntax -> + case tokenize tokenizeOpts syntax rawCode' of + Right slines -> Just $ + formatter fmtOpts{ codeClasses = + [T.toLower (sShortname syntax)], + containerClasses = classes' } slines + Left _ -> Nothing + +-- Functions for correlating latex listings package's language names +-- with skylighting language names: + +langToListingsMap :: M.Map String String +langToListingsMap = M.fromList langsList + +listingsToLangMap :: M.Map String String +listingsToLangMap = M.fromList $ map switch langsList + where switch (a,b) = (b,a) + +langsList :: [(String, String)] +langsList = + [("abap","ABAP"), + ("acm","ACM"), + ("acmscript","ACMscript"), + ("acsl","ACSL"), + ("ada","Ada"), + ("algol","Algol"), + ("ant","Ant"), + ("assembler","Assembler"), + ("gnuassembler","Assembler"), + ("awk","Awk"), + ("bash","bash"), + ("monobasic","Basic"), + ("purebasic","Basic"), + ("c","C"), + ("cpp","C++"), + ("c++","C++"), + ("ocaml","Caml"), + ("cil","CIL"), + ("clean","Clean"), + ("cobol","Cobol"), + ("comal80","Comal80"), + ("command.com","command.com"), + ("comsol","Comsol"), + ("csh","csh"), + ("delphi","Delphi"), + ("elan","Elan"), + ("erlang","erlang"), + ("euphoria","Euphoria"), + ("fortran","Fortran"), + ("gap","GAP"), + ("gcl","GCL"), + ("gnuplot","Gnuplot"), + ("hansl","hansl"), + ("haskell","Haskell"), + ("html","HTML"), + ("idl","IDL"), + ("inform","inform"), + ("java","Java"), + ("jvmis","JVMIS"), + ("ksh","ksh"), + ("lingo","Lingo"), + ("lisp","Lisp"), + ("commonlisp","Lisp"), + ("llvm","LLVM"), + ("logo","Logo"), + ("lua","Lua"), + ("make","make"), + ("makefile","make"), + ("mathematica","Mathematica"), + ("matlab","Matlab"), + ("mercury","Mercury"), + ("metapost","MetaPost"), + ("miranda","Miranda"), + ("mizar","Mizar"), + ("ml","ML"), + ("modula2","Modula-2"), + ("mupad","MuPAD"), + ("nastran","NASTRAN"), + ("oberon2","Oberon-2"), + ("ocl","OCL"), + ("octave","Octave"), + ("oz","Oz"), + ("pascal","Pascal"), + ("perl","Perl"), + ("php","PHP"), + ("pli","PL/I"), + ("plasm","Plasm"), + ("postscript","PostScript"), + ("pov","POV"), + ("prolog","Prolog"), + ("promela","Promela"), + ("pstricks","PSTricks"), + ("python","Python"), + ("r","R"), + ("reduce","Reduce"), + ("rexx","Rexx"), + ("rsl","RSL"), + ("ruby","Ruby"), + ("s","S"), + ("sas","SAS"), + ("scala","Scala"), + ("scilab","Scilab"), + ("sh","sh"), + ("shelxl","SHELXL"), + ("simula","Simula"), + ("sparql","SPARQL"), + ("sql","SQL"), + ("tcl","tcl"), + ("tex","TeX"), + ("latex","TeX"), + ("vbscript","VBScript"), + ("verilog","Verilog"), + ("vhdl","VHDL"), + ("vrml","VRML"), + ("xml","XML"), + ("xslt","XSLT")] + +-- | Determine listings language name from skylighting language name. +toListingsLanguage :: String -> Maybe String +toListingsLanguage lang = M.lookup (map toLower lang) langToListingsMap + +-- | Determine skylighting language name from listings language name. +fromListingsLanguage :: String -> Maybe String +fromListingsLanguage lang = M.lookup lang listingsToLangMap diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs new file mode 100644 index 000000000..cc22c06ca --- /dev/null +++ b/src/Text/Pandoc/ImageSize.hs @@ -0,0 +1,547 @@ +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, CPP #-} +{-# OPTIONS_GHC -fno-warn-type-defaults #-} +{- + Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., 59 + Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | +Module : Text.Pandoc.ImageSize +Copyright : Copyright (C) 2011-2016 John MacFarlane +License : GNU GPL, version 2 or above + +Maintainer : John MacFarlane <jgm@berkeley.edu> +Stability : alpha +Portability : portable + +Functions for determining the size of a PNG, JPEG, or GIF image. +-} +module Text.Pandoc.ImageSize ( ImageType(..) + , imageType + , imageSize + , sizeInPixels + , sizeInPoints + , desiredSizeInPoints + , Dimension(..) + , Direction(..) + , dimension + , inInch + , inPoints + , numUnit + , showInInch + , showInPixel + , showFl + ) where +import Data.ByteString (ByteString, unpack) +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy as BL +import Data.Char (isDigit) +import Control.Monad +import Data.Bits +import Data.Binary +import Data.Binary.Get +import Text.Pandoc.Shared (safeRead) +import Data.Default (Default) +import Numeric (showFFloat) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import qualified Data.Map as M +import Control.Monad.Except +import Data.Maybe (fromMaybe) + +-- quick and dirty functions to get image sizes +-- algorithms borrowed from wwwis.pl + +data ImageType = Png | Gif | Jpeg | Pdf | Eps deriving Show +data Direction = Width | Height +instance Show Direction where + show Width = "width" + show Height = "height" + +data Dimension = Pixel Integer + | Centimeter Double + | Inch Double + | Percent Double +instance Show Dimension where + show (Pixel a) = show a ++ "px" + show (Centimeter a) = showFl a ++ "cm" + show (Inch a) = showFl a ++ "in" + show (Percent a) = show a ++ "%" + +data ImageSize = ImageSize{ + pxX :: Integer + , pxY :: Integer + , dpiX :: Integer + , dpiY :: Integer + } deriving (Read, Show, Eq) +instance Default ImageSize where + def = ImageSize 300 200 72 72 + +showFl :: (RealFloat a) => a -> String +showFl a = showFFloat (Just 5) a "" + +imageType :: ByteString -> Maybe ImageType +imageType img = case B.take 4 img of + "\x89\x50\x4e\x47" -> return Png + "\x47\x49\x46\x38" -> return Gif + "\xff\xd8\xff\xe0" -> return Jpeg -- JFIF + "\xff\xd8\xff\xe1" -> return Jpeg -- Exif + "%PDF" -> return Pdf + "%!PS" + | (B.take 4 $ B.drop 1 $ B.dropWhile (/=' ') img) == "EPSF" + -> return Eps + _ -> mzero + +imageSize :: ByteString -> Either String ImageSize +imageSize img = + case imageType img of + Just Png -> mbToEither "could not determine PNG size" $ pngSize img + Just Gif -> mbToEither "could not determine GIF size" $ gifSize img + Just Jpeg -> jpegSize img + Just Eps -> mbToEither "could not determine EPS size" $ epsSize img + Just Pdf -> Left "could not determine PDF size" -- TODO + Nothing -> Left "could not determine image type" + where mbToEither msg Nothing = Left msg + mbToEither _ (Just x) = Right x + +defaultSize :: (Integer, Integer) +defaultSize = (72, 72) + +sizeInPixels :: ImageSize -> (Integer, Integer) +sizeInPixels s = (pxX s, pxY s) + +-- | Calculate (height, width) in points using the image file's dpi metadata, +-- using 72 Points == 1 Inch. +sizeInPoints :: ImageSize -> (Double, Double) +sizeInPoints s = (pxXf * 72 / dpiXf, pxYf * 72 / dpiYf) + where + pxXf = fromIntegral $ pxX s + pxYf = fromIntegral $ pxY s + dpiXf = fromIntegral $ dpiX s + dpiYf = fromIntegral $ dpiY s + +-- | Calculate (height, width) in points, considering the desired dimensions in the +-- attribute, while falling back on the image file's dpi metadata if no dimensions +-- are specified in the attribute (or only dimensions in percentages). +desiredSizeInPoints :: WriterOptions -> Attr -> ImageSize -> (Double, Double) +desiredSizeInPoints opts attr s = + case (getDim Width, getDim Height) of + (Just w, Just h) -> (w, h) + (Just w, Nothing) -> (w, w / ratio) + (Nothing, Just h) -> (h * ratio, h) + (Nothing, Nothing) -> sizeInPoints s + where + ratio = fromIntegral (pxX s) / fromIntegral (pxY s) + getDim dir = case (dimension dir attr) of + Just (Percent _) -> Nothing + Just dim -> Just $ inPoints opts dim + Nothing -> Nothing + +inPoints :: WriterOptions -> Dimension -> Double +inPoints opts dim = 72 * inInch opts dim + +inInch :: WriterOptions -> Dimension -> Double +inInch opts dim = + case dim of + (Pixel a) -> fromIntegral a / (fromIntegral $ writerDpi opts) + (Centimeter a) -> a * 0.3937007874 + (Inch a) -> a + (Percent _) -> 0 + +-- | Convert a Dimension to a String denoting its equivalent in inches, for example "2.00000". +-- Note: Dimensions in percentages are converted to the empty string. +showInInch :: WriterOptions -> Dimension -> String +showInInch _ (Percent _) = "" +showInInch opts dim = showFl $ inInch opts dim + +-- | Convert a Dimension to a String denoting its equivalent in pixels, for example "600". +-- Note: Dimensions in percentages are converted to the empty string. +showInPixel :: WriterOptions -> Dimension -> String +showInPixel opts dim = + case dim of + (Pixel a) -> show a + (Centimeter a) -> show (floor $ dpi * a * 0.3937007874 :: Int) + (Inch a) -> show (floor $ dpi * a :: Int) + (Percent _) -> "" + where + dpi = fromIntegral $ writerDpi opts + +-- | Maybe split a string into a leading number and trailing unit, e.g. "3cm" to Just (3.0, "cm") +numUnit :: String -> Maybe (Double, String) +numUnit s = + let (nums, unit) = span (\c -> isDigit c || ('.'==c)) s + in case safeRead nums of + Just n -> Just (n, unit) + Nothing -> Nothing + +-- | Read a Dimension from an Attr attribute. +-- `dimension Width attr` might return `Just (Pixel 3)` or for example `Just (Centimeter 2.0)`, etc. +dimension :: Direction -> Attr -> Maybe Dimension +dimension dir (_, _, kvs) = + case dir of + Width -> extractDim "width" + Height -> extractDim "height" + where + extractDim key = + case lookup key kvs of + Just str -> + case numUnit str of + Just (num, unit) -> toDim num unit + Nothing -> Nothing + Nothing -> Nothing + toDim a "cm" = Just $ Centimeter a + toDim a "mm" = Just $ Centimeter (a / 10) + toDim a "in" = Just $ Inch a + toDim a "inch" = Just $ Inch a + toDim a "%" = Just $ Percent a + toDim a "px" = Just $ Pixel (floor a::Integer) + toDim a "" = Just $ Pixel (floor a::Integer) + toDim _ _ = Nothing + +epsSize :: ByteString -> Maybe ImageSize +epsSize img = do + let ls = takeWhile ("%" `B.isPrefixOf`) $ B.lines img + let ls' = dropWhile (not . ("%%BoundingBox:" `B.isPrefixOf`)) ls + case ls' of + [] -> mzero + (x:_) -> case B.words x of + (_:_:_:ux:uy:[]) -> do + ux' <- safeRead $ B.unpack ux + uy' <- safeRead $ B.unpack uy + return ImageSize{ + pxX = ux' + , pxY = uy' + , dpiX = 72 + , dpiY = 72 } + _ -> mzero + +pngSize :: ByteString -> Maybe ImageSize +pngSize img = do + let (h, rest) = B.splitAt 8 img + guard $ h == "\x8a\x4d\x4e\x47\x0d\x0a\x1a\x0a" || + h == "\x89\x50\x4e\x47\x0d\x0a\x1a\x0a" + let (i, rest') = B.splitAt 4 $ B.drop 4 rest + guard $ i == "MHDR" || i == "IHDR" + let (sizes, rest'') = B.splitAt 8 rest' + (x,y) <- case map fromIntegral $ unpack $ sizes of + ([w1,w2,w3,w4,h1,h2,h3,h4] :: [Integer]) -> return + ((shift w1 24) + (shift w2 16) + (shift w3 8) + w4, + (shift h1 24) + (shift h2 16) + (shift h3 8) + h4) + _ -> Nothing -- "PNG parse error" + let (dpix, dpiy) = findpHYs rest'' + return $ ImageSize { pxX = x, pxY = y, dpiX = dpix, dpiY = dpiy } + +findpHYs :: ByteString -> (Integer, Integer) +findpHYs x = + if B.null x || "IDAT" `B.isPrefixOf` x + then (72,72) -- default, no pHYs + else if "pHYs" `B.isPrefixOf` x + then let [x1,x2,x3,x4,y1,y2,y3,y4,u] = map fromIntegral + $ unpack $ B.take 9 $ B.drop 4 x + factor = if u == 1 -- dots per meter + then \z -> z * 254 `div` 10000 + else const 72 + in ( factor $ (shift x1 24) + (shift x2 16) + (shift x3 8) + x4, + factor $ (shift y1 24) + (shift y2 16) + (shift y3 8) + y4 ) + else findpHYs $ B.drop 1 x -- read another byte + +gifSize :: ByteString -> Maybe ImageSize +gifSize img = do + let (h, rest) = B.splitAt 6 img + guard $ h == "GIF87a" || h == "GIF89a" + case map fromIntegral $ unpack $ B.take 4 rest of + [w2,w1,h2,h1] -> return ImageSize { + pxX = shift w1 8 + w2, + pxY = shift h1 8 + h2, + dpiX = 72, + dpiY = 72 + } + _ -> Nothing -- "GIF parse error" + +jpegSize :: ByteString -> Either String ImageSize +jpegSize img = + let (hdr, rest) = B.splitAt 4 img + in if B.length rest < 14 + then Left "unable to determine JPEG size" + else case hdr of + "\xff\xd8\xff\xe0" -> jfifSize rest + "\xff\xd8\xff\xe1" -> exifSize rest + _ -> Left "unable to determine JPEG size" + +jfifSize :: ByteString -> Either String ImageSize +jfifSize rest = + let [dpiDensity,dpix1,dpix2,dpiy1,dpiy2] = map fromIntegral + $ unpack $ B.take 5 $ B.drop 9 $ rest + factor = case dpiDensity of + 1 -> id + 2 -> \x -> (x * 254 `div` 10) + _ -> const 72 + dpix = factor (shift dpix1 8 + dpix2) + dpiy = factor (shift dpiy1 8 + dpiy2) + in case findJfifSize rest of + Left msg -> Left msg + Right (w,h) -> Right $ ImageSize { pxX = w + , pxY = h + , dpiX = dpix + , dpiY = dpiy } + +findJfifSize :: ByteString -> Either String (Integer,Integer) +findJfifSize bs = + let bs' = B.dropWhile (=='\xff') $ B.dropWhile (/='\xff') bs + in case B.uncons bs' of + Just (c,bs'') | c >= '\xc0' && c <= '\xc3' -> + case map fromIntegral $ unpack $ B.take 4 $ B.drop 3 bs'' of + [h1,h2,w1,w2] -> Right (shift w1 8 + w2, shift h1 8 + h2) + _ -> Left "JFIF parse error" + Just (_,bs'') -> + case map fromIntegral $ unpack $ B.take 2 bs'' of + [c1,c2] -> + let len = shift c1 8 + c2 + -- skip variables + in findJfifSize $ B.drop len bs'' + _ -> Left "JFIF parse error" + Nothing -> Left "Did not find JFIF length record" + +runGet' :: Get (Either String a) -> BL.ByteString -> Either String a +runGet' p bl = +#if MIN_VERSION_binary(0,7,0) + case runGetOrFail p bl of + Left (_,_,msg) -> Left msg + Right (_,_,x) -> x +#else + runGet p bl +#endif + + +exifSize :: ByteString -> Either String ImageSize +exifSize bs = runGet' header $ bl + where bl = BL.fromChunks [bs] + header = runExceptT $ exifHeader bl +-- NOTE: It would be nicer to do +-- runGet ((Just <$> exifHeader) <|> return Nothing) +-- which would prevent pandoc from raising an error when an exif header can't +-- be parsed. But we only get an Alternative instance for Get in binary 0.6, +-- and binary 0.5 ships with ghc 7.6. + +exifHeader :: BL.ByteString -> ExceptT String Get ImageSize +exifHeader hdr = do + _app1DataSize <- lift getWord16be + exifHdr <- lift getWord32be + unless (exifHdr == 0x45786966) $ throwError "Did not find exif header" + zeros <- lift getWord16be + unless (zeros == 0) $ throwError "Expected zeros after exif header" + -- beginning of tiff header -- we read whole thing to use + -- in getting data from offsets: + let tiffHeader = BL.drop 8 hdr + byteAlign <- lift getWord16be + let bigEndian = byteAlign == 0x4d4d + let (getWord16, getWord32, getWord64) = + if bigEndian + then (getWord16be, getWord32be, getWord64be) + else (getWord16le, getWord32le, getWord64le) + let getRational = do + num <- getWord32 + den <- getWord32 + return $ fromIntegral num / fromIntegral den + tagmark <- lift getWord16 + unless (tagmark == 0x002a) $ throwError "Failed alignment sanity check" + ifdOffset <- lift getWord32 + lift $ skip (fromIntegral ifdOffset - 8) -- skip to IDF + numentries <- lift getWord16 + let ifdEntry :: ExceptT String Get (TagType, DataFormat) + ifdEntry = do + tag <- fromMaybe UnknownTagType . flip M.lookup tagTypeTable + <$> lift getWord16 + dataFormat <- lift getWord16 + numComponents <- lift getWord32 + (fmt, bytesPerComponent) <- + case dataFormat of + 1 -> return (UnsignedByte <$> getWord8, 1) + 2 -> return (AsciiString <$> + getLazyByteString + (fromIntegral numComponents), 1) + 3 -> return (UnsignedShort <$> getWord16, 2) + 4 -> return (UnsignedLong <$> getWord32, 4) + 5 -> return (UnsignedRational <$> getRational, 8) + 6 -> return (SignedByte <$> getWord8, 1) + 7 -> return (Undefined <$> getLazyByteString + (fromIntegral numComponents), 1) + 8 -> return (SignedShort <$> getWord16, 2) + 9 -> return (SignedLong <$> getWord32, 4) + 10 -> return (SignedRational <$> getRational, 8) + 11 -> return (SingleFloat <$> getWord32 {- TODO -}, 4) + 12 -> return (DoubleFloat <$> getWord64 {- TODO -}, 8) + _ -> throwError $ "Unknown data format " ++ show dataFormat + let totalBytes = fromIntegral $ numComponents * bytesPerComponent + payload <- if totalBytes <= 4 -- data is right here + then lift $ fmt <* skip (4 - totalBytes) + else do -- get data from offset + offs <- lift getWord32 + let bytesAtOffset = + BL.take (fromIntegral totalBytes) + $ BL.drop (fromIntegral offs) tiffHeader + case runGet' (Right <$> fmt) bytesAtOffset of + Left msg -> throwError msg + Right x -> return x + return (tag, payload) + entries <- sequence $ replicate (fromIntegral numentries) ifdEntry + subentries <- case lookup ExifOffset entries of + Just (UnsignedLong offset') -> do + pos <- lift bytesRead + lift $ skip (fromIntegral offset' - (fromIntegral pos - 8)) + numsubentries <- lift getWord16 + sequence $ + replicate (fromIntegral numsubentries) ifdEntry + _ -> return [] + let allentries = entries ++ subentries + (wdth, hght) <- case (lookup ExifImageWidth allentries, + lookup ExifImageHeight allentries) of + (Just (UnsignedLong w), Just (UnsignedLong h)) -> + return (fromIntegral w, fromIntegral h) + _ -> return defaultSize + -- we return a default width and height when + -- the exif header doesn't contain these + let resfactor = case lookup ResolutionUnit allentries of + Just (UnsignedShort 1) -> (100 / 254) + _ -> 1 + let xres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup XResolution allentries + let yres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup YResolution allentries + return $ ImageSize{ + pxX = wdth + , pxY = hght + , dpiX = xres + , dpiY = yres } + +data DataFormat = UnsignedByte Word8 + | AsciiString BL.ByteString + | UnsignedShort Word16 + | UnsignedLong Word32 + | UnsignedRational Rational + | SignedByte Word8 + | Undefined BL.ByteString + | SignedShort Word16 + | SignedLong Word32 + | SignedRational Rational + | SingleFloat Word32 + | DoubleFloat Word64 + deriving (Show) + +data TagType = ImageDescription + | Make + | Model + | Orientation + | XResolution + | YResolution + | ResolutionUnit + | Software + | DateTime + | WhitePoint + | PrimaryChromaticities + | YCbCrCoefficients + | YCbCrPositioning + | ReferenceBlackWhite + | Copyright + | ExifOffset + | ExposureTime + | FNumber + | ExposureProgram + | ISOSpeedRatings + | ExifVersion + | DateTimeOriginal + | DateTimeDigitized + | ComponentConfiguration + | CompressedBitsPerPixel + | ShutterSpeedValue + | ApertureValue + | BrightnessValue + | ExposureBiasValue + | MaxApertureValue + | SubjectDistance + | MeteringMode + | LightSource + | Flash + | FocalLength + | MakerNote + | UserComment + | FlashPixVersion + | ColorSpace + | ExifImageWidth + | ExifImageHeight + | RelatedSoundFile + | ExifInteroperabilityOffset + | FocalPlaneXResolution + | FocalPlaneYResolution + | FocalPlaneResolutionUnit + | SensingMethod + | FileSource + | SceneType + | UnknownTagType + deriving (Show, Eq, Ord) + +tagTypeTable :: M.Map Word16 TagType +tagTypeTable = M.fromList + [ (0x010e, ImageDescription) + , (0x010f, Make) + , (0x0110, Model) + , (0x0112, Orientation) + , (0x011a, XResolution) + , (0x011b, YResolution) + , (0x0128, ResolutionUnit) + , (0x0131, Software) + , (0x0132, DateTime) + , (0x013e, WhitePoint) + , (0x013f, PrimaryChromaticities) + , (0x0211, YCbCrCoefficients) + , (0x0213, YCbCrPositioning) + , (0x0214, ReferenceBlackWhite) + , (0x8298, Copyright) + , (0x8769, ExifOffset) + , (0x829a, ExposureTime) + , (0x829d, FNumber) + , (0x8822, ExposureProgram) + , (0x8827, ISOSpeedRatings) + , (0x9000, ExifVersion) + , (0x9003, DateTimeOriginal) + , (0x9004, DateTimeDigitized) + , (0x9101, ComponentConfiguration) + , (0x9102, CompressedBitsPerPixel) + , (0x9201, ShutterSpeedValue) + , (0x9202, ApertureValue) + , (0x9203, BrightnessValue) + , (0x9204, ExposureBiasValue) + , (0x9205, MaxApertureValue) + , (0x9206, SubjectDistance) + , (0x9207, MeteringMode) + , (0x9208, LightSource) + , (0x9209, Flash) + , (0x920a, FocalLength) + , (0x927c, MakerNote) + , (0x9286, UserComment) + , (0xa000, FlashPixVersion) + , (0xa001, ColorSpace) + , (0xa002, ExifImageWidth) + , (0xa003, ExifImageHeight) + , (0xa004, RelatedSoundFile) + , (0xa005, ExifInteroperabilityOffset) + , (0xa20e, FocalPlaneXResolution) + , (0xa20f, FocalPlaneYResolution) + , (0xa210, FocalPlaneResolutionUnit) + , (0xa217, SensingMethod) + , (0xa300, FileSource) + , (0xa301, SceneType) + ] diff --git a/src/Text/Pandoc/Logging.hs b/src/Text/Pandoc/Logging.hs new file mode 100644 index 000000000..1f98d019e --- /dev/null +++ b/src/Text/Pandoc/Logging.hs @@ -0,0 +1,232 @@ +{-# LANGUAGE DeriveDataTypeable, DeriveGeneric, OverloadedStrings #-} +{- +Copyright (C) 2016-17 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} +{- | + Module : Text.Pandoc.Logging + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +This module provides data types and functions for warnings +and info messages. + +-} +module Text.Pandoc.Logging ( + Verbosity(..) + , LogMessage(..) + , encodeLogMessages + , showLogMessage + , messageVerbosity + ) where + +import Text.Parsec.Pos +import Data.Data (Data) +import Data.Generics (Typeable) +import GHC.Generics (Generic) +import qualified Data.Text as Text +import Data.Aeson +import Text.Pandoc.Definition +import Data.Aeson.Encode.Pretty (encodePretty', keyOrder, + defConfig, Config(..)) +import qualified Data.ByteString.Lazy as BL + +-- | Verbosity level. +data Verbosity = ERROR | WARNING | INFO | DEBUG + deriving (Show, Read, Eq, Data, Enum, Ord, Bounded, Typeable, Generic) + +instance ToJSON Verbosity where + toJSON x = toJSON (show x) + +data LogMessage = + SkippedContent String SourcePos + | CouldNotParseYamlMetadata String SourcePos + | DuplicateLinkReference String SourcePos + | DuplicateNoteReference String SourcePos + | ReferenceNotFound String SourcePos + | CircularReference String SourcePos + | ParsingUnescaped String SourcePos + | CouldNotLoadIncludeFile String SourcePos + | ParsingTrace String SourcePos + | InlineNotRendered Inline + | BlockNotRendered Block + | DocxParserWarning String + | CouldNotFetchResource String String + | CouldNotDetermineImageSize String String + | CouldNotDetermineMimeType String + | CouldNotConvertTeXMath String String + deriving (Show, Eq, Data, Ord, Typeable, Generic) + +instance ToJSON LogMessage where + toJSON x = object $ "verbosity" .= toJSON (messageVerbosity x) : + case x of + SkippedContent s pos -> + ["type" .= String "SkippedContent", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= sourceLine pos, + "column" .= sourceColumn pos] + CouldNotParseYamlMetadata s pos -> + ["type" .= String "YamlSectionNotAnObject", + "message" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + DuplicateLinkReference s pos -> + ["type" .= String "DuplicateLinkReference", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + DuplicateNoteReference s pos -> + ["type" .= String "DuplicateNoteReference", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + ReferenceNotFound s pos -> + ["type" .= String "ReferenceNotFound", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + CircularReference s pos -> + ["type" .= String "CircularReference", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + ParsingUnescaped s pos -> + ["type" .= String "ParsingUnescaped", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + CouldNotLoadIncludeFile fp pos -> + ["type" .= String "CouldNotLoadIncludeFile", + "path" .= Text.pack fp, + "source" .= Text.pack (sourceName pos), + "line" .= toJSON (sourceLine pos), + "column" .= toJSON (sourceColumn pos)] + ParsingTrace s pos -> + ["type" .= String "ParsingTrace", + "contents" .= Text.pack s, + "source" .= Text.pack (sourceName pos), + "line" .= sourceLine pos, + "column" .= sourceColumn pos] + InlineNotRendered il -> + ["type" .= String "InlineNotRendered", + "contents" .= toJSON il] + BlockNotRendered bl -> + ["type" .= String "BlockNotRendered", + "contents" .= toJSON bl] + DocxParserWarning s -> + ["type" .= String "DocxParserWarning", + "contents" .= Text.pack s] + CouldNotFetchResource fp s -> + ["type" .= String "CouldNotFetchResource", + "path" .= Text.pack fp, + "message" .= Text.pack s] + CouldNotDetermineImageSize fp s -> + ["type" .= String "CouldNotDetermineImageSize", + "path" .= Text.pack fp, + "message" .= Text.pack s] + CouldNotDetermineMimeType fp -> + ["type" .= String "CouldNotDetermineMimeType", + "path" .= Text.pack fp] + CouldNotConvertTeXMath s msg -> + ["type" .= String "CouldNotConvertTeXMath", + "contents" .= Text.pack s, + "message" .= Text.pack msg] + +showPos :: SourcePos -> String +showPos pos = sn ++ "line " ++ + show (sourceLine pos) ++ " column " ++ show (sourceColumn pos) + where sn = if sourceName pos == "source" || sourceName pos == "" + then "" + else sourceName pos ++ " " + +encodeLogMessages :: [LogMessage] -> BL.ByteString +encodeLogMessages ms = + encodePretty' defConfig{ confCompare = + keyOrder [ "type", "verbosity", "contents", "message", "path", + "source", "line", "column" ] } ms + +showLogMessage :: LogMessage -> String +showLogMessage msg = + case msg of + SkippedContent s pos -> + "Skipped '" ++ s ++ "' at " ++ showPos pos + CouldNotParseYamlMetadata s pos -> + "Could not parse YAML metadata at " ++ showPos pos ++ + if null s then "" else (": " ++ s) + DuplicateLinkReference s pos -> + "Duplicate link reference '" ++ s ++ "' at " ++ showPos pos + DuplicateNoteReference s pos -> + "Duplicate note reference '" ++ s ++ "' at " ++ showPos pos + ReferenceNotFound s pos -> + "Reference not found for '" ++ s ++ "' at " ++ showPos pos + CircularReference s pos -> + "Circular reference '" ++ s ++ "' at " ++ showPos pos + ParsingUnescaped s pos -> + "Parsing unescaped '" ++ s ++ "' at " ++ showPos pos + CouldNotLoadIncludeFile fp pos -> + "Could not load include file '" ++ fp ++ "' at " ++ showPos pos + ParsingTrace s pos -> + "Parsing trace at " ++ showPos pos ++ ": " ++ s + InlineNotRendered il -> + "Not rendering " ++ show il + BlockNotRendered bl -> + "Not rendering " ++ show bl + DocxParserWarning s -> + "Docx parser warning: " ++ s + CouldNotFetchResource fp s -> + "Could not fetch resource '" ++ fp ++ "'" ++ + if null s then "" else (": " ++ s) + CouldNotDetermineImageSize fp s -> + "Could not determine image size for '" ++ fp ++ "'" ++ + if null s then "" else (": " ++ s) + CouldNotDetermineMimeType fp -> + "Could not determine mime type for '" ++ fp ++ "'" + CouldNotConvertTeXMath s m -> + "Could not convert TeX math '" ++ s ++ "', rendering as TeX" ++ + if null m then "" else (':':'\n':m) + +messageVerbosity:: LogMessage -> Verbosity +messageVerbosity msg = + case msg of + SkippedContent{} -> INFO + CouldNotParseYamlMetadata{} -> WARNING + DuplicateLinkReference{} -> WARNING + DuplicateNoteReference{} -> WARNING + ReferenceNotFound{} -> WARNING + CircularReference{} -> WARNING + CouldNotLoadIncludeFile{} -> WARNING + ParsingUnescaped{} -> INFO + ParsingTrace{} -> DEBUG + InlineNotRendered{} -> INFO + BlockNotRendered{} -> INFO + DocxParserWarning{} -> WARNING + CouldNotFetchResource{} -> WARNING + CouldNotDetermineImageSize{} -> WARNING + CouldNotDetermineMimeType{} -> WARNING + CouldNotConvertTeXMath{} -> WARNING + + diff --git a/src/Text/Pandoc/MIME.hs b/src/Text/Pandoc/MIME.hs new file mode 100644 index 000000000..a08091217 --- /dev/null +++ b/src/Text/Pandoc/MIME.hs @@ -0,0 +1,527 @@ +{- +Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.MIME + Copyright : Copyright (C) 2011-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Mime type lookup for ODT writer. +-} +module Text.Pandoc.MIME ( MimeType, getMimeType, getMimeTypeDef, + extensionFromMimeType )where +import System.FilePath +import Data.Char ( toLower ) +import Data.List (isPrefixOf, isSuffixOf) +import Data.Maybe (fromMaybe) +import qualified Data.Map as M + +type MimeType = String + +-- | Determine mime type appropriate for file path. +getMimeType :: FilePath -> Maybe MimeType +getMimeType fp + -- ODT + | fp == "layout-cache" = + Just "application/binary" + | "Formula-" `isPrefixOf` fp && "/" `isSuffixOf` fp = + Just "application/vnd.oasis.opendocument.formula" + -- generic + | otherwise = M.lookup (map toLower $ drop 1 $ takeExtension fp) mimeTypes + +-- | Determime mime type appropriate for file path, defaulting to +-- “application/octet-stream” if nothing else fits. +getMimeTypeDef :: FilePath -> MimeType +getMimeTypeDef = fromMaybe "application/octet-stream" . getMimeType + +extensionFromMimeType :: MimeType -> Maybe String +extensionFromMimeType mimetype = + M.lookup (takeWhile (/=';') mimetype) reverseMimeTypes + -- note: we just look up the basic mime type, dropping the content-encoding etc. + +reverseMimeTypes :: M.Map MimeType String +reverseMimeTypes = M.fromList $ map (\(k,v) -> (v,k)) mimeTypesList + +mimeTypes :: M.Map String MimeType +mimeTypes = M.fromList mimeTypesList + +mimeTypesList :: [(String, MimeType)] +mimeTypesList = -- List borrowed from happstack-server. + [("gz","application/x-gzip") + ,("cabal","application/x-cabal") + ,("%","application/x-trash") + ,("323","text/h323") + ,("3gp","video/3gpp") + ,("7z","application/x-7z-compressed") + ,("abw","application/x-abiword") + ,("ai","application/postscript") + ,("aif","audio/x-aiff") + ,("aifc","audio/x-aiff") + ,("aiff","audio/x-aiff") + ,("alc","chemical/x-alchemy") + ,("art","image/x-jg") + ,("asc","text/plain") + ,("asf","video/x-ms-asf") + ,("asn","chemical/x-ncbi-asn1") + ,("aso","chemical/x-ncbi-asn1-binary") + ,("asx","video/x-ms-asf") + ,("atom","application/atom") + ,("atomcat","application/atomcat+xml") + ,("atomsrv","application/atomserv+xml") + ,("au","audio/basic") + ,("avi","video/x-msvideo") + ,("b","chemical/x-molconn-Z") + ,("bak","application/x-trash") + ,("bat","application/x-msdos-program") + ,("bcpio","application/x-bcpio") + ,("bib","text/x-bibtex") + ,("bin","application/octet-stream") + ,("bmp","image/x-ms-bmp") + ,("boo","text/x-boo") + ,("book","application/x-maker") + ,("bsd","chemical/x-crossfire") + ,("c","text/x-csrc") + ,("c++","text/x-c++src") + ,("c3d","chemical/x-chem3d") + ,("cab","application/x-cab") + ,("cac","chemical/x-cache") + ,("cache","chemical/x-cache") + ,("cap","application/cap") + ,("cascii","chemical/x-cactvs-binary") + ,("cat","application/vnd.ms-pki.seccat") + ,("cbin","chemical/x-cactvs-binary") + ,("cbr","application/x-cbr") + ,("cbz","application/x-cbz") + ,("cc","text/x-c++src") + ,("cdf","application/x-cdf") + ,("cdr","image/x-coreldraw") + ,("cdt","image/x-coreldrawtemplate") + ,("cdx","chemical/x-cdx") + ,("cdy","application/vnd.cinderella") + ,("cef","chemical/x-cxf") + ,("cer","chemical/x-cerius") + ,("chm","chemical/x-chemdraw") + ,("chrt","application/x-kchart") + ,("cif","chemical/x-cif") + ,("class","application/java-vm") + ,("cls","text/x-tex") + ,("cmdf","chemical/x-cmdf") + ,("cml","chemical/x-cml") + ,("cod","application/vnd.rim.cod") + ,("com","application/x-msdos-program") + ,("cpa","chemical/x-compass") + ,("cpio","application/x-cpio") + ,("cpp","text/x-c++src") + ,("cpt","application/mac-compactpro") + ,("crl","application/x-pkcs7-crl") + ,("crt","application/x-x509-ca-cert") + ,("csf","chemical/x-cache-csf") + ,("csh","application/x-csh") + ,("csm","chemical/x-csml") + ,("csml","chemical/x-csml") + ,("css","text/css") + ,("csv","text/csv") + ,("ctab","chemical/x-cactvs-binary") + ,("ctx","chemical/x-ctx") + ,("cu","application/cu-seeme") + ,("cub","chemical/x-gaussian-cube") + ,("cxf","chemical/x-cxf") + ,("cxx","text/x-c++src") + ,("d","text/x-dsrc") + ,("dat","chemical/x-mopac-input") + ,("dcr","application/x-director") + ,("deb","application/x-debian-package") + ,("dif","video/dv") + ,("diff","text/x-diff") + ,("dir","application/x-director") + ,("djv","image/vnd.djvu") + ,("djvu","image/vnd.djvu") + ,("dl","video/dl") + ,("dll","application/x-msdos-program") + ,("dmg","application/x-apple-diskimage") + ,("dms","application/x-dms") + ,("doc","application/msword") + ,("dot","application/msword") + ,("dv","video/dv") + ,("dvi","application/x-dvi") + ,("dx","chemical/x-jcamp-dx") + ,("dxr","application/x-director") + ,("emb","chemical/x-embl-dl-nucleotide") + ,("embl","chemical/x-embl-dl-nucleotide") + ,("emf","image/x-emf") + ,("eml","message/rfc822") + ,("ent","chemical/x-ncbi-asn1-ascii") + ,("eot","application/vnd.ms-fontobject") + ,("eps","application/postscript") + ,("etx","text/x-setext") + ,("exe","application/x-msdos-program") + ,("ez","application/andrew-inset") + ,("fb","application/x-maker") + ,("fbdoc","application/x-maker") + ,("fch","chemical/x-gaussian-checkpoint") + ,("fchk","chemical/x-gaussian-checkpoint") + ,("fig","application/x-xfig") + ,("flac","application/x-flac") + ,("fli","video/fli") + ,("fm","application/x-maker") + ,("frame","application/x-maker") + ,("frm","application/x-maker") + ,("fs","text/plain") + ,("gal","chemical/x-gaussian-log") + ,("gam","chemical/x-gamess-input") + ,("gamin","chemical/x-gamess-input") + ,("gau","chemical/x-gaussian-input") + ,("gcd","text/x-pcs-gcd") + ,("gcf","application/x-graphing-calculator") + ,("gcg","chemical/x-gcg8-sequence") + ,("gen","chemical/x-genbank") + ,("gf","application/x-tex-gf") + ,("gif","image/gif") + ,("gjc","chemical/x-gaussian-input") + ,("gjf","chemical/x-gaussian-input") + ,("gl","video/gl") + ,("gnumeric","application/x-gnumeric") + ,("gpt","chemical/x-mopac-graph") + ,("gsf","application/x-font") + ,("gsm","audio/x-gsm") + ,("gtar","application/x-gtar") + ,("h","text/x-chdr") + ,("h++","text/x-c++hdr") + ,("hdf","application/x-hdf") + ,("hh","text/x-c++hdr") + ,("hin","chemical/x-hin") + ,("hpp","text/x-c++hdr") + ,("hqx","application/mac-binhex40") + ,("hs","text/x-haskell") + ,("hta","application/hta") + ,("htc","text/x-component") + ,("htm","text/html") + ,("html","text/html") + ,("hxx","text/x-c++hdr") + ,("ica","application/x-ica") + ,("ice","x-conference/x-cooltalk") + ,("ico","image/x-icon") + ,("ics","text/calendar") + ,("icz","text/calendar") + ,("ief","image/ief") + ,("iges","model/iges") + ,("igs","model/iges") + ,("iii","application/x-iphone") + ,("inp","chemical/x-gamess-input") + ,("ins","application/x-internet-signup") + ,("iso","application/x-iso9660-image") + ,("isp","application/x-internet-signup") + ,("ist","chemical/x-isostar") + ,("istr","chemical/x-isostar") + ,("jad","text/vnd.sun.j2me.app-descriptor") + ,("jar","application/java-archive") + ,("java","text/x-java") + ,("jdx","chemical/x-jcamp-dx") + ,("jmz","application/x-jmol") + ,("jng","image/x-jng") + ,("jnlp","application/x-java-jnlp-file") + ,("jpe","image/jpeg") + ,("jpeg","image/jpeg") + ,("jfif","image/jpeg") + ,("jpg","image/jpeg") + ,("js","application/x-javascript") + ,("kar","audio/midi") + ,("key","application/pgp-keys") + ,("kil","application/x-killustrator") + ,("kin","chemical/x-kinemage") + ,("kml","application/vnd.google-earth.kml+xml") + ,("kmz","application/vnd.google-earth.kmz") + ,("kpr","application/x-kpresenter") + ,("kpt","application/x-kpresenter") + ,("ksp","application/x-kspread") + ,("kwd","application/x-kword") + ,("kwt","application/x-kword") + ,("latex","application/x-latex") + ,("lha","application/x-lha") + ,("lhs","text/x-literate-haskell") + ,("lsf","video/x-la-asf") + ,("lsx","video/x-la-asf") + ,("ltx","text/x-tex") + ,("lyx","application/x-lyx") + ,("lzh","application/x-lzh") + ,("lzx","application/x-lzx") + ,("m3u","audio/mpegurl") + ,("m4a","audio/mpeg") + ,("m4v","video/x-m4v") + ,("maker","application/x-maker") + ,("man","application/x-troff-man") + ,("mcif","chemical/x-mmcif") + ,("mcm","chemical/x-macmolecule") + ,("mdb","application/msaccess") + ,("me","application/x-troff-me") + ,("mesh","model/mesh") + ,("mid","audio/midi") + ,("midi","audio/midi") + ,("mif","application/x-mif") + ,("mm","application/x-freemind") + ,("mmd","chemical/x-macromodel-input") + ,("mmf","application/vnd.smaf") + ,("mml","text/mathml") + ,("mmod","chemical/x-macromodel-input") + ,("mng","video/x-mng") + ,("moc","text/x-moc") + ,("mol","chemical/x-mdl-molfile") + ,("mol2","chemical/x-mol2") + ,("moo","chemical/x-mopac-out") + ,("mop","chemical/x-mopac-input") + ,("mopcrt","chemical/x-mopac-input") + ,("mov","video/quicktime") + ,("movie","video/x-sgi-movie") + ,("mp2","audio/mpeg") + ,("mp3","audio/mpeg") + ,("mp4","video/mp4") + ,("mpc","chemical/x-mopac-input") + ,("mpe","video/mpeg") + ,("mpeg","video/mpeg") + ,("mpega","audio/mpeg") + ,("mpg","video/mpeg") + ,("mpga","audio/mpeg") + ,("ms","application/x-troff-ms") + ,("msh","model/mesh") + ,("msi","application/x-msi") + ,("mvb","chemical/x-mopac-vib") + ,("mxu","video/vnd.mpegurl") + ,("nb","application/mathematica") + ,("nc","application/x-netcdf") + ,("nwc","application/x-nwc") + ,("o","application/x-object") + ,("oda","application/oda") + ,("odb","application/vnd.oasis.opendocument.database") + ,("odc","application/vnd.oasis.opendocument.chart") + ,("odf","application/vnd.oasis.opendocument.formula") + ,("odg","application/vnd.oasis.opendocument.graphics") + ,("odi","application/vnd.oasis.opendocument.image") + ,("odm","application/vnd.oasis.opendocument.text-master") + ,("odp","application/vnd.oasis.opendocument.presentation") + ,("ods","application/vnd.oasis.opendocument.spreadsheet") + ,("odt","application/vnd.oasis.opendocument.text") + ,("oga","audio/ogg") + ,("ogg","application/ogg") + ,("ogv","video/ogg") + ,("ogx","application/ogg") + ,("old","application/x-trash") + ,("otg","application/vnd.oasis.opendocument.graphics-template") + ,("oth","application/vnd.oasis.opendocument.text-web") + ,("otp","application/vnd.oasis.opendocument.presentation-template") + ,("ots","application/vnd.oasis.opendocument.spreadsheet-template") + ,("otf","application/vnd.ms-opentype") + ,("ott","application/vnd.oasis.opendocument.text-template") + ,("oza","application/x-oz-application") + ,("p","text/x-pascal") + ,("p7r","application/x-pkcs7-certreqresp") + ,("pac","application/x-ns-proxy-autoconfig") + ,("pas","text/x-pascal") + ,("pat","image/x-coreldrawpattern") + ,("patch","text/x-diff") + ,("pbm","image/x-portable-bitmap") + ,("pcap","application/cap") + ,("pcf","application/x-font") + ,("pcf.Z","application/x-font") + ,("pcx","image/pcx") + ,("pdb","chemical/x-pdb") + ,("pdf","application/pdf") + ,("pfa","application/x-font") + ,("pfb","application/x-font") + ,("pgm","image/x-portable-graymap") + ,("pgn","application/x-chess-pgn") + ,("pgp","application/pgp-signature") + ,("php","application/x-httpd-php") + ,("php3","application/x-httpd-php3") + ,("php3p","application/x-httpd-php3-preprocessed") + ,("php4","application/x-httpd-php4") + ,("phps","application/x-httpd-php-source") + ,("pht","application/x-httpd-php") + ,("phtml","application/x-httpd-php") + ,("pk","application/x-tex-pk") + ,("pl","text/x-perl") + ,("pls","audio/x-scpls") + ,("pm","text/x-perl") + ,("png","image/png") + ,("pnm","image/x-portable-anymap") + ,("pot","text/plain") + ,("ppm","image/x-portable-pixmap") + ,("pps","application/vnd.ms-powerpoint") + ,("ppt","application/vnd.ms-powerpoint") + ,("prf","application/pics-rules") + ,("prt","chemical/x-ncbi-asn1-ascii") + ,("ps","application/postscript") + ,("psd","image/x-photoshop") + ,("py","text/x-python") + ,("pyc","application/x-python-code") + ,("pyo","application/x-python-code") + ,("qt","video/quicktime") + ,("qtl","application/x-quicktimeplayer") + ,("ra","audio/x-pn-realaudio") + ,("ram","audio/x-pn-realaudio") + ,("rar","application/rar") + ,("ras","image/x-cmu-raster") + ,("rd","chemical/x-mdl-rdfile") + ,("rdf","application/rdf+xml") + ,("rgb","image/x-rgb") + ,("rhtml","application/x-httpd-eruby") + ,("rm","audio/x-pn-realaudio") + ,("roff","application/x-troff") + ,("ros","chemical/x-rosdal") + ,("rpm","application/x-redhat-package-manager") + ,("rss","application/rss+xml") + ,("rtf","application/rtf") + ,("rtx","text/richtext") + ,("rxn","chemical/x-mdl-rxnfile") + ,("sct","text/scriptlet") + ,("sd","chemical/x-mdl-sdfile") + ,("sd2","audio/x-sd2") + ,("sda","application/vnd.stardivision.draw") + ,("sdc","application/vnd.stardivision.calc") + ,("sdd","application/vnd.stardivision.impress") + ,("sdf","application/vnd.stardivision.math") + ,("sds","application/vnd.stardivision.chart") + ,("sdw","application/vnd.stardivision.writer") + ,("ser","application/java-serialized-object") + ,("sgf","application/x-go-sgf") + ,("sgl","application/vnd.stardivision.writer-global") + ,("sh","application/x-sh") + ,("shar","application/x-shar") + ,("shtml","text/html") + ,("sid","audio/prs.sid") + ,("sik","application/x-trash") + ,("silo","model/mesh") + ,("sis","application/vnd.symbian.install") + ,("sisx","x-epoc/x-sisx-app") + ,("sit","application/x-stuffit") + ,("sitx","application/x-stuffit") + ,("skd","application/x-koan") + ,("skm","application/x-koan") + ,("skp","application/x-koan") + ,("skt","application/x-koan") + ,("smi","application/smil") + ,("smil","application/smil") + ,("snd","audio/basic") + ,("spc","chemical/x-galactic-spc") + ,("spl","application/futuresplash") + ,("spx","audio/ogg") + ,("src","application/x-wais-source") + ,("stc","application/vnd.sun.xml.calc.template") + ,("std","application/vnd.sun.xml.draw.template") + ,("sti","application/vnd.sun.xml.impress.template") + ,("stl","application/vnd.ms-pki.stl") + ,("stw","application/vnd.sun.xml.writer.template") + ,("sty","text/x-tex") + ,("sv4cpio","application/x-sv4cpio") + ,("sv4crc","application/x-sv4crc") + ,("svg","image/svg+xml") + -- removed for now, since it causes problems with + -- extensionFromMimeType: see #2183. + -- ,("svgz","image/svg+xml") + ,("sw","chemical/x-swissprot") + ,("swf","application/x-shockwave-flash") + ,("swfl","application/x-shockwave-flash") + ,("sxc","application/vnd.sun.xml.calc") + ,("sxd","application/vnd.sun.xml.draw") + ,("sxg","application/vnd.sun.xml.writer.global") + ,("sxi","application/vnd.sun.xml.impress") + ,("sxm","application/vnd.sun.xml.math") + ,("sxw","application/vnd.sun.xml.writer") + ,("t","application/x-troff") + ,("tar","application/x-tar") + ,("taz","application/x-gtar") + ,("tcl","application/x-tcl") + ,("tex","text/x-tex") + ,("texi","application/x-texinfo") + ,("texinfo","application/x-texinfo") + ,("text","text/plain") + ,("tgf","chemical/x-mdl-tgf") + ,("tgz","application/x-gtar") + ,("tif","image/tiff") + ,("tiff","image/tiff") + ,("tk","text/x-tcl") + ,("tm","text/texmacs") + ,("torrent","application/x-bittorrent") + ,("tr","application/x-troff") + ,("ts","text/texmacs") + ,("tsp","application/dsptype") + ,("tsv","text/tab-separated-values") + ,("ttf","application/x-font-truetype") + ,("txt","text/plain") + ,("udeb","application/x-debian-package") + ,("uls","text/iuls") + ,("ustar","application/x-ustar") + ,("val","chemical/x-ncbi-asn1-binary") + ,("vcd","application/x-cdlink") + ,("vcf","text/x-vcard") + ,("vcs","text/x-vcalendar") + ,("vmd","chemical/x-vmd") + ,("vms","chemical/x-vamas-iso14976") + ,("vrm","x-world/x-vrml") + ,("vrml","model/vrml") + ,("vs","text/plain") + ,("vsd","application/vnd.visio") + ,("vtt","text/vtt") + ,("wad","application/x-doom") + ,("wav","audio/x-wav") + ,("wax","audio/x-ms-wax") + ,("wbmp","image/vnd.wap.wbmp") + ,("wbxml","application/vnd.wap.wbxml") + ,("webm","video/webm") + ,("wk","application/x-123") + ,("wm","video/x-ms-wm") + ,("wma","audio/x-ms-wma") + ,("wmd","application/x-ms-wmd") + ,("wmf","image/x-wmf") + ,("wml","text/vnd.wap.wml") + ,("wmlc","application/vnd.wap.wmlc") + ,("wmls","text/vnd.wap.wmlscript") + ,("wmlsc","application/vnd.wap.wmlscriptc") + ,("wmv","video/x-ms-wmv") + ,("wmx","video/x-ms-wmx") + ,("wmz","application/x-ms-wmz") + ,("woff","application/font-woff") + ,("woff2","font/woff2") + ,("wp5","application/wordperfect5.1") + ,("wpd","application/wordperfect") + ,("wrl","model/vrml") + ,("wsc","text/scriptlet") + ,("wvx","video/x-ms-wvx") + ,("wz","application/x-wingz") + ,("xbm","image/x-xbitmap") + ,("xcf","application/x-xcf") + ,("xht","application/xhtml+xml") + ,("xhtml","application/xhtml+xml") + ,("xlb","application/vnd.ms-excel") + ,("xls","application/vnd.ms-excel") + ,("xlt","application/vnd.ms-excel") + ,("xml","application/xml") + ,("xpi","application/x-xpinstall") + ,("xpm","image/x-xpixmap") + ,("xsl","application/xml") + ,("xtel","chemical/x-xtel") + ,("xul","application/vnd.mozilla.xul+xml") + ,("xwd","image/x-xwindowdump") + ,("xyz","chemical/x-xyz") + ,("zip","application/zip") + ,("zmt","chemical/x-mopac-input") + ] + diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs new file mode 100644 index 000000000..fe99be5fe --- /dev/null +++ b/src/Text/Pandoc/MediaBag.hs @@ -0,0 +1,113 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving, DeriveDataTypeable #-} +{- +Copyright (C) 2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.MediaBag + Copyright : Copyright (C) 2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Definition of a MediaBag object to hold binary resources, and an +interface for interacting with it. +-} +module Text.Pandoc.MediaBag ( + MediaBag, + lookupMedia, + insertMedia, + mediaDirectory, + extractMediaBag + ) where +import System.FilePath +import qualified System.FilePath.Posix as Posix +import System.Directory (createDirectoryIfMissing) +import qualified Data.Map as M +import qualified Data.ByteString.Lazy as BL +import Control.Monad (when) +import Control.Monad.Trans (MonadIO(..)) +import Text.Pandoc.MIME (MimeType, getMimeTypeDef) +import qualified Text.Pandoc.UTF8 as UTF8 +import Data.Maybe (fromMaybe) +import System.IO (stderr) +import Data.Data (Data) +import Data.Typeable (Typeable) + +-- | A container for a collection of binary resources, with names and +-- mime types. Note that a 'MediaBag' is a Monoid, so 'mempty' +-- can be used for an empty 'MediaBag', and '<>' can be used to append +-- two 'MediaBag's. +newtype MediaBag = MediaBag (M.Map [String] (MimeType, BL.ByteString)) + deriving (Monoid, Data, Typeable) + +instance Show MediaBag where + show bag = "MediaBag " ++ show (mediaDirectory bag) + +-- | Insert a media item into a 'MediaBag', replacing any existing +-- value with the same name. +insertMedia :: FilePath -- ^ relative path and canonical name of resource + -> Maybe MimeType -- ^ mime type (Nothing = determine from extension) + -> BL.ByteString -- ^ contents of resource + -> MediaBag + -> MediaBag +insertMedia fp mbMime contents (MediaBag mediamap) = + MediaBag (M.insert (splitDirectories fp) (mime, contents) mediamap) + where mime = fromMaybe fallback mbMime + fallback = case takeExtension fp of + ".gz" -> getMimeTypeDef $ dropExtension fp + _ -> getMimeTypeDef fp + +-- | Lookup a media item in a 'MediaBag', returning mime type and contents. +lookupMedia :: FilePath + -> MediaBag + -> Maybe (MimeType, BL.ByteString) +lookupMedia fp (MediaBag mediamap) = M.lookup (splitDirectories fp) mediamap + +-- | Get a list of the file paths stored in a 'MediaBag', with +-- their corresponding mime types and the lengths in bytes of the contents. +mediaDirectory :: MediaBag -> [(String, MimeType, Int)] +mediaDirectory (MediaBag mediamap) = + M.foldWithKey (\fp (mime,contents) -> + (((Posix.joinPath fp), mime, fromIntegral $ BL.length contents):)) [] mediamap + +-- | Extract contents of MediaBag to a given directory. Print informational +-- messages if 'verbose' is true. +-- TODO: eventually we may want to put this into PandocMonad +-- In PandocPure, it could write to the fake file system... +extractMediaBag :: MonadIO m + => Bool + -> FilePath + -> MediaBag + -> m () +extractMediaBag verbose dir (MediaBag mediamap) = liftIO $ do + sequence_ $ M.foldWithKey + (\fp (_ ,contents) -> + ((writeMedia verbose dir (Posix.joinPath fp, contents)):)) [] mediamap + +writeMedia :: Bool -> FilePath -> (FilePath, BL.ByteString) -> IO () +writeMedia verbose dir (subpath, bs) = do + -- we join and split to convert a/b/c to a\b\c on Windows; + -- in zip containers all paths use / + let fullpath = dir </> normalise subpath + createDirectoryIfMissing True $ takeDirectory fullpath + when verbose $ UTF8.hPutStrLn stderr $ "pandoc: extracting " ++ fullpath + BL.writeFile fullpath bs + + diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs new file mode 100644 index 000000000..bc62f87d0 --- /dev/null +++ b/src/Text/Pandoc/Options.hs @@ -0,0 +1,217 @@ +{-# LANGUAGE DeriveDataTypeable, DeriveGeneric #-} +{- +Copyright (C) 2012-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Options + Copyright : Copyright (C) 2012-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Data structures and functions for representing parser and writer +options. +-} +module Text.Pandoc.Options ( module Text.Pandoc.Extensions + , ReaderOptions(..) + , HTMLMathMethod (..) + , CiteMethod (..) + , ObfuscationMethod (..) + , HTMLSlideVariant (..) + , EPUBVersion (..) + , WrapOption (..) + , TopLevelDivision (..) + , WriterOptions (..) + , TrackChanges (..) + , ReferenceLocation (..) + , def + , isEnabled + ) where +import Text.Pandoc.Extensions +import Data.Default +import Text.Pandoc.Highlighting (Style, pygments) +import Data.Data (Data) +import Data.Typeable (Typeable) +import GHC.Generics (Generic) + +data ReaderOptions = ReaderOptions{ + readerExtensions :: Extensions -- ^ Syntax extensions + , readerStandalone :: Bool -- ^ Standalone document with header + , readerColumns :: Int -- ^ Number of columns in terminal + , readerTabStop :: Int -- ^ Tab stop + , readerApplyMacros :: Bool -- ^ Apply macros to TeX math + , readerIndentedCodeClasses :: [String] -- ^ Default classes for + -- indented code blocks + , readerDefaultImageExtension :: String -- ^ Default extension for images + , readerTrackChanges :: TrackChanges +} deriving (Show, Read, Data, Typeable, Generic) + +instance Default ReaderOptions + where def = ReaderOptions{ + readerExtensions = emptyExtensions + , readerStandalone = False + , readerColumns = 80 + , readerTabStop = 4 + , readerApplyMacros = True + , readerIndentedCodeClasses = [] + , readerDefaultImageExtension = "" + , readerTrackChanges = AcceptChanges + } + +-- +-- Writer options +-- + +data EPUBVersion = EPUB2 | EPUB3 deriving (Eq, Show, Read, Data, Typeable, Generic) + +data HTMLMathMethod = PlainMath + | LaTeXMathML (Maybe String) -- url of LaTeXMathML.js + | JsMath (Maybe String) -- url of jsMath load script + | GladTeX + | WebTeX String -- url of TeX->image script. + | MathML + | MathJax String -- url of MathJax.js + | KaTeX String String -- url of stylesheet and katex.js + deriving (Show, Read, Eq, Data, Typeable, Generic) + +data CiteMethod = Citeproc -- use citeproc to render them + | Natbib -- output natbib cite commands + | Biblatex -- output biblatex cite commands + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Methods for obfuscating email addresses in HTML. +data ObfuscationMethod = NoObfuscation + | ReferenceObfuscation + | JavascriptObfuscation + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Varieties of HTML slide shows. +data HTMLSlideVariant = S5Slides + | SlidySlides + | SlideousSlides + | DZSlides + | RevealJsSlides + | NoSlides + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Options for accepting or rejecting MS Word track-changes. +data TrackChanges = AcceptChanges + | RejectChanges + | AllChanges + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Options for wrapping text in the output. +data WrapOption = WrapAuto -- ^ Automatically wrap to width + | WrapNone -- ^ No non-semantic newlines + | WrapPreserve -- ^ Preserve wrapping of input source + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Options defining the type of top-level headers. +data TopLevelDivision = TopLevelPart -- ^ Top-level headers become parts + | TopLevelChapter -- ^ Top-level headers become chapters + | TopLevelSection -- ^ Top-level headers become sections + | TopLevelDefault -- ^ Top-level type is determined via + -- heuristics + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Locations for footnotes and references in markdown output +data ReferenceLocation = EndOfBlock -- ^ End of block + | EndOfSection -- ^ prior to next section header (or end of document) + | EndOfDocument -- ^ at end of document + deriving (Show, Read, Eq, Data, Typeable, Generic) + +-- | Options for writers +data WriterOptions = WriterOptions + { writerTemplate :: Maybe String -- ^ Template to use + , writerVariables :: [(String, String)] -- ^ Variables to set in template + , writerTabStop :: Int -- ^ Tabstop for conversion btw spaces and tabs + , writerTableOfContents :: Bool -- ^ Include table of contents + , writerIncremental :: Bool -- ^ True if lists should be incremental + , writerHTMLMathMethod :: HTMLMathMethod -- ^ How to print math in HTML + , writerNumberSections :: Bool -- ^ Number sections in LaTeX + , writerNumberOffset :: [Int] -- ^ Starting number for section, subsection, ... + , writerSectionDivs :: Bool -- ^ Put sections in div tags in HTML + , writerExtensions :: Extensions -- ^ Markdown extensions that can be used + , writerReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst + , writerDpi :: Int -- ^ Dpi for pixel to/from inch/cm conversions + , writerWrapText :: WrapOption -- ^ Option for wrapping text + , writerColumns :: Int -- ^ Characters in a line (for text wrapping) + , writerEmailObfuscation :: ObfuscationMethod -- ^ How to obfuscate emails + , writerIdentifierPrefix :: String -- ^ Prefix for section & note ids in HTML + -- and for footnote marks in markdown + , writerSourceURL :: Maybe String -- ^ Absolute URL + directory of 1st source file + , writerUserDataDir :: Maybe FilePath -- ^ Path of user data directory + , writerCiteMethod :: CiteMethod -- ^ How to print cites + , writerHtmlQTags :: Bool -- ^ Use @<q>@ tags for quotes in HTML + , writerSlideLevel :: Maybe Int -- ^ Force header level of slides + , writerTopLevelDivision :: TopLevelDivision -- ^ Type of top-level divisions + , writerListings :: Bool -- ^ Use listings package for code + , writerHighlightStyle :: Maybe Style -- ^ Style to use for highlighting + -- (Nothing = no highlighting) + , writerSetextHeaders :: Bool -- ^ Use setext headers for levels 1-2 in markdown + , writerEpubMetadata :: Maybe String -- ^ Metadata to include in EPUB + , writerEpubStylesheet :: Maybe String -- ^ EPUB stylesheet specified at command line + , writerEpubFonts :: [FilePath] -- ^ Paths to fonts to embed + , writerEpubChapterLevel :: Int -- ^ Header level for chapters (separate files) + , writerTOCDepth :: Int -- ^ Number of levels to include in TOC + , writerReferenceDoc :: Maybe FilePath -- ^ Path to reference document if specified + , writerLaTeXArgs :: [String] -- ^ Flags to pass to latex-engine + , writerReferenceLocation :: ReferenceLocation -- ^ Location of footnotes and references for writing markdown + } deriving (Show, Data, Typeable, Generic) + +instance Default WriterOptions where + def = WriterOptions { writerTemplate = Nothing + , writerVariables = [] + , writerTabStop = 4 + , writerTableOfContents = False + , writerIncremental = False + , writerHTMLMathMethod = PlainMath + , writerNumberSections = False + , writerNumberOffset = [0,0,0,0,0,0] + , writerSectionDivs = False + , writerExtensions = emptyExtensions + , writerReferenceLinks = False + , writerDpi = 96 + , writerWrapText = WrapAuto + , writerColumns = 72 + , writerEmailObfuscation = NoObfuscation + , writerIdentifierPrefix = "" + , writerSourceURL = Nothing + , writerUserDataDir = Nothing + , writerCiteMethod = Citeproc + , writerHtmlQTags = False + , writerSlideLevel = Nothing + , writerTopLevelDivision = TopLevelDefault + , writerListings = False + , writerHighlightStyle = Just pygments + , writerSetextHeaders = True + , writerEpubMetadata = Nothing + , writerEpubStylesheet = Nothing + , writerEpubFonts = [] + , writerEpubChapterLevel = 1 + , writerTOCDepth = 3 + , writerReferenceDoc = Nothing + , writerLaTeXArgs = [] + , writerReferenceLocation = EndOfDocument + } + +-- | Returns True if the given extension is enabled. +isEnabled :: Extension -> WriterOptions -> Bool +isEnabled ext opts = ext `extensionEnabled` (writerExtensions opts) diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs new file mode 100644 index 000000000..1b3b4eb88 --- /dev/null +++ b/src/Text/Pandoc/PDF.hs @@ -0,0 +1,369 @@ +{-# LANGUAGE OverloadedStrings, CPP, ScopedTypeVariables #-} +{- +Copyright (C) 2012-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.PDF + Copyright : Copyright (C) 2012-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of LaTeX documents to PDF. +-} +module Text.Pandoc.PDF ( makePDF ) where + +import Data.ByteString.Lazy (ByteString) +import qualified Data.ByteString.Lazy as B +import qualified Data.ByteString.Lazy.Char8 as BC +import qualified Data.ByteString as BS +import Data.Monoid ((<>)) +import System.Exit (ExitCode (..)) +import System.FilePath +import System.IO (stdout) +import System.IO.Temp (withTempFile) +import System.Directory +import Data.Digest.Pure.SHA (showDigest, sha1) +import System.Environment +import Control.Monad (unless, when, (<=<)) +import qualified Control.Exception as E +import Data.List (isInfixOf) +import Data.Maybe (fromMaybe) +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Definition +import Text.Pandoc.MediaBag +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Shared (warn, withTempDir, inDirectory, stringify) +import Text.Pandoc.Writers.Shared (getField, metaToJSON) +import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..)) +import Text.Pandoc.Logging (Verbosity(..)) +import Text.Pandoc.MIME (extensionFromMimeType, getMimeType) +import Text.Pandoc.Process (pipeProcess) +import Control.Monad.Trans (MonadIO(..)) +import qualified Data.ByteString.Lazy as BL +import qualified Codec.Picture as JP +#ifdef _WINDOWS +import Data.List (intercalate) +#endif +import Text.Pandoc.Class (PandocIO, runIOorExplode, fetchItem, setMediaBag, runIO) + +#ifdef _WINDOWS +changePathSeparators :: FilePath -> FilePath +changePathSeparators = intercalate "/" . splitDirectories +#endif + +makePDF :: MonadIO m + => String -- ^ pdf creator (pdflatex, lualatex, + -- xelatex, context, wkhtmltopdf) + -> (WriterOptions -> Pandoc -> PandocIO String) -- ^ writer + -> WriterOptions -- ^ options + -> Verbosity -- ^ verbosity level + -> MediaBag -- ^ media + -> Pandoc -- ^ document + -> m (Either ByteString ByteString) +makePDF "wkhtmltopdf" writer opts verbosity _ doc@(Pandoc meta _) = liftIO $ do + let mathArgs = case writerHTMLMathMethod opts of + -- with MathJax, wait til all math is rendered: + MathJax _ -> ["--run-script", "MathJax.Hub.Register.StartupHook('End Typeset', function() { window.status = 'mathjax_loaded' });", + "--window-status", "mathjax_loaded"] + _ -> [] + meta' <- metaToJSON opts (return . stringify) (return . stringify) meta + let toArgs (f, mbd) = maybe [] (\d -> ['-':'-':f, d]) mbd + let args = mathArgs ++ + concatMap toArgs + [("page-size", getField "papersize" meta') + ,("title", getField "title" meta') + ,("margin-bottom", fromMaybe (Just "1.2in") + (getField "margin-bottom" meta')) + ,("margin-top", fromMaybe (Just "1.25in") + (getField "margin-top" meta')) + ,("margin-right", fromMaybe (Just "1.25in") + (getField "margin-right" meta')) + ,("margin-left", fromMaybe (Just "1.25in") + (getField "margin-left" meta')) + ] + source <- runIOorExplode $ writer opts doc + html2pdf verbosity args source +makePDF program writer opts verbosity mediabag doc = + liftIO $ withTempDir "tex2pdf." $ \tmpdir -> do + doc' <- handleImages opts mediabag tmpdir doc + source <- runIOorExplode $ writer opts doc' + let args = writerLaTeXArgs opts + case takeBaseName program of + "context" -> context2pdf verbosity tmpdir source + prog | prog `elem` ["pdflatex", "lualatex", "xelatex"] + -> tex2pdf' verbosity args tmpdir program source + _ -> return $ Left $ UTF8.fromStringLazy $ "Unknown program " ++ program + +handleImages :: WriterOptions + -> MediaBag + -> FilePath -- ^ temp dir to store images + -> Pandoc -- ^ document + -> IO Pandoc +handleImages opts mediabag tmpdir = + walkM (convertImages tmpdir) <=< walkM (handleImage' opts mediabag tmpdir) + +handleImage' :: WriterOptions + -> MediaBag + -> FilePath + -> Inline + -> IO Inline +handleImage' opts mediabag tmpdir (Image attr ils (src,tit)) = do + exists <- doesFileExist src + if exists + then return $ Image attr ils (src,tit) + else do + res <- runIO $ do + setMediaBag mediabag + fetchItem (writerSourceURL opts) src + case res of + Right (contents, Just mime) -> do + let ext = fromMaybe (takeExtension src) $ + extensionFromMimeType mime + let basename = showDigest $ sha1 $ BL.fromChunks [contents] + let fname = tmpdir </> basename <.> ext + BS.writeFile fname contents + return $ Image attr ils (fname,tit) + _ -> do + warn $ "Could not find image `" ++ src ++ "', skipping..." + -- return alt text + return $ Emph ils +handleImage' _ _ _ x = return x + +convertImages :: FilePath -> Inline -> IO Inline +convertImages tmpdir (Image attr ils (src, tit)) = do + img <- convertImage tmpdir src + newPath <- + case img of + Left e -> src <$ warn e + Right fp -> return fp + return (Image attr ils (newPath, tit)) +convertImages _ x = return x + +-- Convert formats which do not work well in pdf to png +convertImage :: FilePath -> FilePath -> IO (Either String FilePath) +convertImage tmpdir fname = + case mime of + Just "image/png" -> doNothing + Just "image/jpeg" -> doNothing + Just "application/pdf" -> doNothing + _ -> JP.readImage fname >>= \res -> + case res of + Left _ -> return $ Left $ "Unable to convert `" ++ + fname ++ "' for use with pdflatex." + Right img -> + E.catch (Right fileOut <$ JP.savePngImage fileOut img) $ + \(e :: E.SomeException) -> return (Left (show e)) + where + fileOut = replaceDirectory (replaceExtension fname ".png") tmpdir + mime = getMimeType fname + doNothing = return (Right fname) + +tex2pdf' :: Verbosity -- ^ Verbosity level + -> [String] -- ^ Arguments to the latex-engine + -> FilePath -- ^ temp directory for output + -> String -- ^ tex program + -> String -- ^ tex source + -> IO (Either ByteString ByteString) +tex2pdf' verbosity args tmpDir program source = do + let numruns = if "\\tableofcontents" `isInfixOf` source + then 3 -- to get page numbers + else 2 -- 1 run won't give you PDF bookmarks + (exit, log', mbPdf) <- runTeXProgram verbosity program args 1 numruns tmpDir source + case (exit, mbPdf) of + (ExitFailure _, _) -> do + let logmsg = extractMsg log' + let extramsg = + case logmsg of + x | "! Package inputenc Error" `BC.isPrefixOf` x + && program /= "xelatex" + -> "\nTry running pandoc with --latex-engine=xelatex." + _ -> "" + return $ Left $ logmsg <> extramsg + (ExitSuccess, Nothing) -> return $ Left "" + (ExitSuccess, Just pdf) -> return $ Right pdf + +-- parsing output + +extractMsg :: ByteString -> ByteString +extractMsg log' = do + let msg' = dropWhile (not . ("!" `BC.isPrefixOf`)) $ BC.lines log' + let (msg'',rest) = break ("l." `BC.isPrefixOf`) msg' + let lineno = take 1 rest + if null msg' + then log' + else BC.unlines (msg'' ++ lineno) + +extractConTeXtMsg :: ByteString -> ByteString +extractConTeXtMsg log' = do + let msg' = take 1 $ + dropWhile (not . ("tex error" `BC.isPrefixOf`)) $ BC.lines log' + if null msg' + then log' + else BC.unlines msg' + +-- running tex programs + +-- Run a TeX program on an input bytestring and return (exit code, +-- contents of stdout, contents of produced PDF if any). Rerun +-- a fixed number of times to resolve references. +runTeXProgram :: Verbosity -> String -> [String] -> Int -> Int -> FilePath + -> String -> IO (ExitCode, ByteString, Maybe ByteString) +runTeXProgram verbosity program args runNumber numRuns tmpDir source = do + let file = tmpDir </> "input.tex" + exists <- doesFileExist file + unless exists $ UTF8.writeFile file source +#ifdef _WINDOWS + -- note: we want / even on Windows, for TexLive + let tmpDir' = changePathSeparators tmpDir + let file' = changePathSeparators file +#else + let tmpDir' = tmpDir + let file' = file +#endif + let programArgs = ["-halt-on-error", "-interaction", "nonstopmode", + "-output-directory", tmpDir'] ++ args ++ [file'] + env' <- getEnvironment + let sep = [searchPathSeparator] + let texinputs = maybe (tmpDir' ++ sep) ((tmpDir' ++ sep) ++) + $ lookup "TEXINPUTS" env' + let env'' = ("TEXINPUTS", texinputs) : + [(k,v) | (k,v) <- env', k /= "TEXINPUTS"] + when (verbosity >= INFO && runNumber == 1) $ do + putStrLn "[makePDF] temp dir:" + putStrLn tmpDir' + putStrLn "[makePDF] Command line:" + putStrLn $ program ++ " " ++ unwords (map show programArgs) + putStr "\n" + putStrLn "[makePDF] Environment:" + mapM_ print env'' + putStr "\n" + putStrLn $ "[makePDF] Contents of " ++ file' ++ ":" + B.readFile file' >>= B.putStr + putStr "\n" + (exit, out) <- pipeProcess (Just env'') program programArgs BL.empty + when (verbosity >= INFO) $ do + putStrLn $ "[makePDF] Run #" ++ show runNumber + B.hPutStr stdout out + putStr "\n" + if runNumber <= numRuns + then runTeXProgram verbosity program args (runNumber + 1) numRuns tmpDir source + else do + let pdfFile = replaceDirectory (replaceExtension file ".pdf") tmpDir + pdfExists <- doesFileExist pdfFile + pdf <- if pdfExists + -- We read PDF as a strict bytestring to make sure that the + -- temp directory is removed on Windows. + -- See https://github.com/jgm/pandoc/issues/1192. + then (Just . B.fromChunks . (:[])) `fmap` BS.readFile pdfFile + else return Nothing + return (exit, out, pdf) + +html2pdf :: Verbosity -- ^ Verbosity level + -> [String] -- ^ Args to wkhtmltopdf + -> String -- ^ HTML5 source + -> IO (Either ByteString ByteString) +html2pdf verbosity args source = do + file <- withTempFile "." "html2pdf.html" $ \fp _ -> return fp + pdfFile <- withTempFile "." "html2pdf.pdf" $ \fp _ -> return fp + UTF8.writeFile file source + let programArgs = args ++ [file, pdfFile] + env' <- getEnvironment + when (verbosity >= INFO) $ do + putStrLn "[makePDF] Command line:" + putStrLn $ "wkhtmltopdf" ++ " " ++ unwords (map show programArgs) + putStr "\n" + putStrLn "[makePDF] Environment:" + mapM_ print env' + putStr "\n" + putStrLn $ "[makePDF] Contents of " ++ file ++ ":" + B.readFile file >>= B.putStr + putStr "\n" + (exit, out) <- pipeProcess (Just env') "wkhtmltopdf" programArgs BL.empty + removeFile file + when (verbosity >= INFO) $ do + B.hPutStr stdout out + putStr "\n" + pdfExists <- doesFileExist pdfFile + mbPdf <- if pdfExists + -- We read PDF as a strict bytestring to make sure that the + -- temp directory is removed on Windows. + -- See https://github.com/jgm/pandoc/issues/1192. + then do + res <- (Just . B.fromChunks . (:[])) `fmap` BS.readFile pdfFile + removeFile pdfFile + return res + else return Nothing + return $ case (exit, mbPdf) of + (ExitFailure _, _) -> Left out + (ExitSuccess, Nothing) -> Left "" + (ExitSuccess, Just pdf) -> Right pdf + +context2pdf :: Verbosity -- ^ Verbosity level + -> FilePath -- ^ temp directory for output + -> String -- ^ ConTeXt source + -> IO (Either ByteString ByteString) +context2pdf verbosity tmpDir source = inDirectory tmpDir $ do + let file = "input.tex" + UTF8.writeFile file source +#ifdef _WINDOWS + -- note: we want / even on Windows, for TexLive + let tmpDir' = changePathSeparators tmpDir +#else + let tmpDir' = tmpDir +#endif + let programArgs = "--batchmode" : [file] + env' <- getEnvironment + let sep = [searchPathSeparator] + let texinputs = maybe (".." ++ sep) ((".." ++ sep) ++) + $ lookup "TEXINPUTS" env' + let env'' = ("TEXINPUTS", texinputs) : + [(k,v) | (k,v) <- env', k /= "TEXINPUTS"] + when (verbosity >= INFO) $ do + putStrLn "[makePDF] temp dir:" + putStrLn tmpDir' + putStrLn "[makePDF] Command line:" + putStrLn $ "context" ++ " " ++ unwords (map show programArgs) + putStr "\n" + putStrLn "[makePDF] Environment:" + mapM_ print env'' + putStr "\n" + putStrLn $ "[makePDF] Contents of " ++ file ++ ":" + B.readFile file >>= B.putStr + putStr "\n" + (exit, out) <- pipeProcess (Just env'') "context" programArgs BL.empty + when (verbosity >= INFO) $ do + B.hPutStr stdout out + putStr "\n" + let pdfFile = replaceExtension file ".pdf" + pdfExists <- doesFileExist pdfFile + mbPdf <- if pdfExists + -- We read PDF as a strict bytestring to make sure that the + -- temp directory is removed on Windows. + -- See https://github.com/jgm/pandoc/issues/1192. + then (Just . B.fromChunks . (:[])) `fmap` BS.readFile pdfFile + else return Nothing + case (exit, mbPdf) of + (ExitFailure _, _) -> do + let logmsg = extractConTeXtMsg out + return $ Left logmsg + (ExitSuccess, Nothing) -> return $ Left "" + (ExitSuccess, Just pdf) -> return $ Right pdf + diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs new file mode 100644 index 000000000..400d07f2a --- /dev/null +++ b/src/Text/Pandoc/Parsing.hs @@ -0,0 +1,1329 @@ +{-# LANGUAGE + FlexibleContexts +, GeneralizedNewtypeDeriving +, TypeSynonymInstances +, MultiParamTypeClasses +, FlexibleInstances +, IncoherentInstances #-} + +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Parsing + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +A utility library with parsers used in pandoc readers. +-} +module Text.Pandoc.Parsing ( anyLine, + many1Till, + notFollowedBy', + oneOfStrings, + oneOfStringsCI, + spaceChar, + nonspaceChar, + skipSpaces, + blankline, + blanklines, + enclosed, + stringAnyCase, + parseFromString, + lineClump, + charsInBalanced, + romanNumeral, + emailAddress, + uri, + mathInline, + mathDisplay, + withHorizDisplacement, + withRaw, + escaped, + characterReference, + anyOrderedListMarker, + orderedListMarker, + charRef, + lineBlockLines, + tableWith, + widthsFromIndices, + gridTableWith, + readWith, + readWithM, + testStringWith, + guardEnabled, + guardDisabled, + updateLastStrPos, + notAfterString, + logMessage, + reportLogMessages, + ParserState (..), + HasReaderOptions (..), + HasHeaderMap (..), + HasIdentifierList (..), + HasMacros (..), + HasLogMessages (..), + HasLastStrPosition (..), + defaultParserState, + HeaderType (..), + ParserContext (..), + QuoteContext (..), + HasQuoteContext (..), + NoteTable, + NoteTable', + KeyTable, + SubstTable, + Key (..), + toKey, + registerHeader, + smartPunctuation, + singleQuoteStart, + singleQuoteEnd, + doubleQuoteStart, + doubleQuoteEnd, + ellipses, + apostrophe, + dash, + nested, + citeKey, + macro, + applyMacros', + Parser, + ParserT, + F(..), + runF, + askF, + asksF, + token, + (<+?>), + extractIdClass, + insertIncludedFile, + -- * Re-exports from Text.Pandoc.Parsec + Stream, + runParser, + runParserT, + parse, + anyToken, + getInput, + setInput, + unexpected, + char, + letter, + digit, + alphaNum, + skipMany, + skipMany1, + spaces, + space, + anyChar, + satisfy, + newline, + string, + count, + eof, + noneOf, + oneOf, + lookAhead, + notFollowedBy, + many, + many1, + manyTill, + (<|>), + (<?>), + choice, + try, + sepBy, + sepBy1, + sepEndBy, + sepEndBy1, + endBy, + endBy1, + option, + optional, + optionMaybe, + getState, + setState, + updateState, + SourcePos, + getPosition, + setPosition, + sourceColumn, + sourceLine, + setSourceColumn, + setSourceLine, + newPos + ) +where + +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.XML (fromEntities) +import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) +import Text.Parsec hiding (token) +import Text.Parsec.Pos (newPos) +import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, + isHexDigit, isSpace, isPunctuation ) +import Data.List ( intercalate, transpose, isSuffixOf ) +import Text.Pandoc.Shared +import qualified Data.Map as M +import Text.TeXMath.Readers.TeX.Macros (applyMacros, Macro, + parseMacroDefinitions) +import Text.HTML.TagSoup.Entity ( lookupEntity ) +import Text.Pandoc.Asciify (toAsciiChar) +import Data.Monoid ((<>)) +import Text.Pandoc.Class (PandocMonad, readFileFromDirs, report) +import Text.Pandoc.Logging +import Data.Default +import qualified Data.Set as Set +import Control.Monad.Reader +import Control.Monad.Identity +import Data.Maybe (catMaybes) + +import Text.Pandoc.Error +import Control.Monad.Except + +type Parser t s = Parsec t s + +type ParserT = ParsecT + +newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Applicative, Functor) + +runF :: F a -> ParserState -> a +runF = runReader . unF + +askF :: F ParserState +askF = F ask + +asksF :: (ParserState -> a) -> F a +asksF f = F $ asks f + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = liftM mconcat . sequence + +-- | Parse any line of text +anyLine :: Stream [Char] m Char => ParserT [Char] st m [Char] +anyLine = do + -- This is much faster than: + -- manyTill anyChar newline + inp <- getInput + pos <- getPosition + case break (=='\n') inp of + (this, '\n':rest) -> do + -- needed to persuade parsec that this won't match an empty string: + anyChar + setInput rest + setPosition $ incSourceLine (setSourceColumn pos 1) 1 + return this + _ -> mzero + +-- | Like @manyTill@, but reads at least one item. +many1Till :: Stream s m t + => ParserT s st m a + -> ParserT s st m end + -> ParserT s st m [a] +many1Till p end = do + first <- p + rest <- manyTill p end + return (first:rest) + +-- | A more general form of @notFollowedBy@. This one allows any +-- type of parser to be specified, and succeeds only if that parser fails. +-- It does not consume any input. +notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m () +notFollowedBy' p = try $ join $ do a <- try p + return (unexpected (show a)) + <|> + return (return ()) +-- (This version due to Andrew Pimlott on the Haskell mailing list.) + +oneOfStrings' :: Stream s m Char => (Char -> Char -> Bool) -> [String] -> ParserT s st m String +oneOfStrings' _ [] = fail "no strings" +oneOfStrings' matches strs = try $ do + c <- anyChar + let strs' = [xs | (x:xs) <- strs, x `matches` c] + case strs' of + [] -> fail "not found" + _ -> (c:) <$> oneOfStrings' matches strs' + <|> if "" `elem` strs' + then return [c] + else fail "not found" + +-- | Parses one of a list of strings. If the list contains +-- two strings one of which is a prefix of the other, the longer +-- string will be matched if possible. +oneOfStrings :: Stream s m Char => [String] -> ParserT s st m String +oneOfStrings = oneOfStrings' (==) + +-- | Parses one of a list of strings (tried in order), case insensitive. +oneOfStringsCI :: Stream s m Char => [String] -> ParserT s st m String +oneOfStringsCI = oneOfStrings' ciMatch + where ciMatch x y = toLower' x == toLower' y + -- this optimizes toLower by checking common ASCII case + -- first, before calling the expensive unicode-aware + -- function: + toLower' c | c >= 'A' && c <= 'Z' = chr (ord c + 32) + | isAscii c = c + | otherwise = toLower c + +-- | Parses a space or tab. +spaceChar :: Stream s m Char => ParserT s st m Char +spaceChar = satisfy $ \c -> c == ' ' || c == '\t' + +-- | Parses a nonspace, nonnewline character. +nonspaceChar :: Stream s m Char => ParserT s st m Char +nonspaceChar = satisfy $ flip notElem ['\t', '\n', ' ', '\r'] + +-- | Skips zero or more spaces or tabs. +skipSpaces :: Stream s m Char => ParserT s st m () +skipSpaces = skipMany spaceChar + +-- | Skips zero or more spaces or tabs, then reads a newline. +blankline :: Stream s m Char => ParserT s st m Char +blankline = try $ skipSpaces >> newline + +-- | Parses one or more blank lines and returns a string of newlines. +blanklines :: Stream s m Char => ParserT s st m [Char] +blanklines = many1 blankline + +-- | Parses material enclosed between start and end parsers. +enclosed :: Stream s m Char => ParserT s st m t -- ^ start parser + -> ParserT s st m end -- ^ end parser + -> ParserT s st m a -- ^ content parser (to be used repeatedly) + -> ParserT s st m [a] +enclosed start end parser = try $ + start >> notFollowedBy space >> many1Till parser end + +-- | Parse string, case insensitive. +stringAnyCase :: Stream s m Char => [Char] -> ParserT s st m String +stringAnyCase [] = string "" +stringAnyCase (x:xs) = do + firstChar <- char (toUpper x) <|> char (toLower x) + rest <- stringAnyCase xs + return (firstChar:rest) + +-- | Parse contents of 'str' using 'parser' and return result. +parseFromString :: Monad m => ParserT String st m a -> String -> ParserT String st m a +parseFromString parser str = do + oldPos <- getPosition + oldInput <- getInput + setInput str + result <- parser + spaces + eof + setInput oldInput + setPosition oldPos + return result + +-- | Parse raw line block up to and including blank lines. +lineClump :: Stream [Char] m Char => ParserT [Char] st m String +lineClump = blanklines + <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) + +-- | Parse a string of characters between an open character +-- and a close character, including text between balanced +-- pairs of open and close, which must be different. For example, +-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" +-- and return "hello (there)". +charsInBalanced :: Stream s m Char => Char -> Char -> ParserT s st m Char + -> ParserT s st m String +charsInBalanced open close parser = try $ do + char open + let isDelim c = c == open || c == close + raw <- many $ many1 (notFollowedBy (satisfy isDelim) >> parser) + <|> (do res <- charsInBalanced open close parser + return $ [open] ++ res ++ [close]) + char close + return $ concat raw + +-- old charsInBalanced would be: +-- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline) +-- old charsInBalanced' would be: +-- charsInBalanced open close anyChar + +-- Auxiliary functions for romanNumeral: + +lowercaseRomanDigits :: [Char] +lowercaseRomanDigits = ['i','v','x','l','c','d','m'] + +uppercaseRomanDigits :: [Char] +uppercaseRomanDigits = map toUpper lowercaseRomanDigits + +-- | Parses a roman numeral (uppercase or lowercase), returns number. +romanNumeral :: Stream s m Char => Bool -- ^ Uppercase if true + -> ParserT s st m Int +romanNumeral upperCase = do + let romanDigits = if upperCase + then uppercaseRomanDigits + else lowercaseRomanDigits + lookAhead $ oneOf romanDigits + let [one, five, ten, fifty, hundred, fivehundred, thousand] = + map char romanDigits + thousands <- many thousand >>= (return . (1000 *) . length) + ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 + fivehundreds <- many fivehundred >>= (return . (500 *) . length) + fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 + hundreds <- many hundred >>= (return . (100 *) . length) + nineties <- option 0 $ try $ ten >> hundred >> return 90 + fifties <- many fifty >>= (return . (50 *) . length) + forties <- option 0 $ try $ ten >> fifty >> return 40 + tens <- many ten >>= (return . (10 *) . length) + nines <- option 0 $ try $ one >> ten >> return 9 + fives <- many five >>= (return . (5 *) . length) + fours <- option 0 $ try $ one >> five >> return 4 + ones <- many one >>= (return . length) + let total = thousands + ninehundreds + fivehundreds + fourhundreds + + hundreds + nineties + fifties + forties + tens + nines + + fives + fours + ones + if total == 0 + then fail "not a roman numeral" + else return total + +-- Parsers for email addresses and URIs + +-- | Parses an email address; returns original and corresponding +-- escaped mailto: URI. +emailAddress :: Stream s m Char => ParserT s st m (String, String) +emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) + where toResult mbox dom = let full = fromEntities $ mbox ++ '@':dom + in (full, escapeURI $ "mailto:" ++ full) + mailbox = intercalate "." <$> (emailWord `sepby1` dot) + domain = intercalate "." <$> (subdomain `sepby1` dot) + dot = char '.' + subdomain = many1 $ alphaNum <|> innerPunct + -- this excludes some valid email addresses, since an + -- email could contain e.g. '__', but gives better results + -- for our purposes, when combined with markdown parsing: + innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') + <* notFollowedBy space + <* notFollowedBy (satisfy isPunctuation)) + -- technically an email address could begin with a symbol, + -- but allowing this creates too many problems. + -- See e.g. https://github.com/jgm/pandoc/issues/2940 + emailWord = do x <- satisfy isAlphaNum + xs <- many (satisfy isEmailChar) + return (x:xs) + isEmailChar c = isAlphaNum c || isEmailPunct c + isEmailPunct c = c `elem` "!\"#$%&'*+-/=?^_{|}~;" + -- note: sepBy1 from parsec consumes input when sep + -- succeeds and p fails, so we use this variant here. + sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p)) + + +-- Schemes from http://www.iana.org/assignments/uri-schemes.html plus +-- the unofficial schemes coap, doi, javascript, isbn, pmid +schemes :: [String] +schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid", + "crid","data","dav","dict","dns","file","ftp","geo","go","gopher", + "h323","http","https","iax","icap","im","imap","info","ipp","iris", + "iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid", + "msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp", + "opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve", + "sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet", + "tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon", + "xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s", + "adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin", + "bolo","callto","chrome","chrome-extension","com-eventbrite-attendee", + "content", "cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb", + "ed2k","facetime","feed","finger","fish","gg","git","gizmoproject", + "gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms", + "keyparc","lastfm","ldaps","magnet","maps","market","message","mms", + "ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi", + "platform","proxy","psyc","query","res","resource","rmi","rsync", + "rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify", + "ssh","steam","svn","teamspeak","things","udp","unreal","ut2004", + "ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri", + "ymsgr", "isbn", "pmid"] + +uriScheme :: Stream s m Char => ParserT s st m String +uriScheme = oneOfStringsCI schemes + +-- | Parses a URI. Returns pair of original and URI-escaped version. +uri :: Stream [Char] m Char => ParserT [Char] st m (String, String) +uri = try $ do + scheme <- uriScheme + char ':' + -- We allow sentence punctuation except at the end, since + -- we don't want the trailing '.' in 'http://google.com.' We want to allow + -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) + -- as a URL, while NOT picking up the closing paren in + -- (http://wikipedia.org). So we include balanced parens in the URL. + let isWordChar c = isAlphaNum c || c `elem` "#$%*+/@\\_-" + let wordChar = satisfy isWordChar + let percentEscaped = try $ char '%' >> skipMany1 (satisfy isHexDigit) + let entity = () <$ characterReference + let punct = skipMany1 (char ',') + <|> () <$ (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) + let uriChunk = skipMany1 wordChar + <|> percentEscaped + <|> entity + <|> (try $ punct >> + lookAhead (void (satisfy isWordChar) <|> percentEscaped)) + str <- snd <$> withRaw (skipMany1 ( () <$ + (enclosed (char '(') (char ')') uriChunk + <|> enclosed (char '{') (char '}') uriChunk + <|> enclosed (char '[') (char ']') uriChunk) + <|> uriChunk)) + str' <- option str $ char '/' >> return (str ++ "/") + let uri' = scheme ++ ":" ++ fromEntities str' + return (uri', escapeURI uri') + +mathInlineWith :: Stream s m Char => String -> String -> ParserT s st m String +mathInlineWith op cl = try $ do + string op + notFollowedBy space + words' <- many1Till (count 1 (noneOf " \t\n\\") + <|> (char '\\' >> + -- This next clause is needed because \text{..} can + -- contain $, \(\), etc. + (try (string "text" >> + (("\\text" ++) <$> inBalancedBraces 0 "")) + <|> (\c -> ['\\',c]) <$> anyChar)) + <|> do (blankline <* notFollowedBy' blankline) <|> + (oneOf " \t" <* skipMany (oneOf " \t")) + notFollowedBy (char '$') + return " " + ) (try $ string cl) + notFollowedBy digit -- to prevent capture of $5 + return $ concat words' + where + inBalancedBraces :: Stream s m Char => Int -> String -> ParserT s st m String + inBalancedBraces 0 "" = do + c <- anyChar + if c == '{' + then inBalancedBraces 1 "{" + else mzero + inBalancedBraces 0 s = return $ reverse s + inBalancedBraces numOpen ('\\':xs) = do + c <- anyChar + inBalancedBraces numOpen (c:'\\':xs) + inBalancedBraces numOpen xs = do + c <- anyChar + case c of + '}' -> inBalancedBraces (numOpen - 1) (c:xs) + '{' -> inBalancedBraces (numOpen + 1) (c:xs) + _ -> inBalancedBraces numOpen (c:xs) + +mathDisplayWith :: Stream s m Char => String -> String -> ParserT s st m String +mathDisplayWith op cl = try $ do + string op + many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ string cl) + +mathDisplay :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m String +mathDisplay = + (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathDisplayWith "\\[" "\\]") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathDisplayWith "\\\\[" "\\\\]") + +mathInline :: (HasReaderOptions st , Stream s m Char) + => ParserT s st m String +mathInline = + (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathInlineWith "\\(" "\\)") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathInlineWith "\\\\(" "\\\\)") + +-- | Applies a parser, returns tuple of its results and its horizontal +-- displacement (the difference between the source column at the end +-- and the source column at the beginning). Vertical displacement +-- (source row) is ignored. +withHorizDisplacement :: Stream s m Char + => ParserT s st m a -- ^ Parser to apply + -> ParserT s st m (a, Int) -- ^ (result, displacement) +withHorizDisplacement parser = do + pos1 <- getPosition + result <- parser + pos2 <- getPosition + return (result, sourceColumn pos2 - sourceColumn pos1) + +-- | Applies a parser and returns the raw string that was parsed, +-- along with the value produced by the parser. +withRaw :: Stream [Char] m Char => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) +withRaw parser = do + pos1 <- getPosition + inp <- getInput + result <- parser + pos2 <- getPosition + let (l1,c1) = (sourceLine pos1, sourceColumn pos1) + let (l2,c2) = (sourceLine pos2, sourceColumn pos2) + let inplines = take ((l2 - l1) + 1) $ lines inp + let raw = case inplines of + [] -> "" + [l] -> take (c2 - c1) l + ls -> unlines (init ls) ++ take (c2 - 1) (last ls) + return (result, raw) + +-- | Parses backslash, then applies character parser. +escaped :: Stream s m Char + => ParserT s st m Char -- ^ Parser for character to escape + -> ParserT s st m Char +escaped parser = try $ char '\\' >> parser + +-- | Parse character entity. +characterReference :: Stream s m Char => ParserT s st m Char +characterReference = try $ do + char '&' + ent <- many1Till nonspaceChar (char ';') + let ent' = case ent of + '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug + '#':_ -> ent + _ -> ent ++ ";" + case lookupEntity ent' of + Just (c : _) -> return c + _ -> fail "entity not found" + +-- | Parses an uppercase roman numeral and returns (UpperRoman, number). +upperRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperRoman = do + num <- romanNumeral True + return (UpperRoman, num) + +-- | Parses a lowercase roman numeral and returns (LowerRoman, number). +lowerRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerRoman = do + num <- romanNumeral False + return (LowerRoman, num) + +-- | Parses a decimal numeral and returns (Decimal, number). +decimal :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +decimal = do + num <- many1 digit + return (Decimal, read num) + +-- | Parses a '@' and optional label and +-- returns (DefaultStyle, [next example number]). The next +-- example number is incremented in parser state, and the label +-- (if present) is added to the label table. +exampleNum :: Stream s m Char + => ParserT s ParserState m (ListNumberStyle, Int) +exampleNum = do + char '@' + lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-')) + st <- getState + let num = stateNextExample st + let newlabels = if null lab + then stateExamples st + else M.insert lab num $ stateExamples st + updateState $ \s -> s{ stateNextExample = num + 1 + , stateExamples = newlabels } + return (Example, num) + +-- | Parses a '#' returns (DefaultStyle, 1). +defaultNum :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +defaultNum = do + char '#' + return (DefaultStyle, 1) + +-- | Parses a lowercase letter and returns (LowerAlpha, number). +lowerAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerAlpha = do + ch <- oneOf ['a'..'z'] + return (LowerAlpha, ord ch - ord 'a' + 1) + +-- | Parses an uppercase letter and returns (UpperAlpha, number). +upperAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperAlpha = do + ch <- oneOf ['A'..'Z'] + return (UpperAlpha, ord ch - ord 'A' + 1) + +-- | Parses a roman numeral i or I +romanOne :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +romanOne = (char 'i' >> return (LowerRoman, 1)) <|> + (char 'I' >> return (UpperRoman, 1)) + +-- | Parses an ordered list marker and returns list attributes. +anyOrderedListMarker :: Stream s m Char => ParserT s ParserState m ListAttributes +anyOrderedListMarker = choice $ + [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], + numParser <- [decimal, exampleNum, defaultNum, romanOne, + lowerAlpha, lowerRoman, upperAlpha, upperRoman]] + +-- | Parses a list number (num) followed by a period, returns list attributes. +inPeriod :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inPeriod num = try $ do + (style, start) <- num + char '.' + let delim = if style == DefaultStyle + then DefaultDelim + else Period + return (start, style, delim) + +-- | Parses a list number (num) followed by a paren, returns list attributes. +inOneParen :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inOneParen num = try $ do + (style, start) <- num + char ')' + return (start, style, OneParen) + +-- | Parses a list number (num) enclosed in parens, returns list attributes. +inTwoParens :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inTwoParens num = try $ do + char '(' + (style, start) <- num + char ')' + return (start, style, TwoParens) + +-- | Parses an ordered list marker with a given style and delimiter, +-- returns number. +orderedListMarker :: Stream s m Char + => ListNumberStyle + -> ListNumberDelim + -> ParserT s ParserState m Int +orderedListMarker style delim = do + let num = defaultNum <|> -- # can continue any kind of list + case style of + DefaultStyle -> decimal + Example -> exampleNum + Decimal -> decimal + UpperRoman -> upperRoman + LowerRoman -> lowerRoman + UpperAlpha -> upperAlpha + LowerAlpha -> lowerAlpha + let context = case delim of + DefaultDelim -> inPeriod + Period -> inPeriod + OneParen -> inOneParen + TwoParens -> inTwoParens + (start, _, _) <- context num + return start + +-- | Parses a character reference and returns a Str element. +charRef :: Stream s m Char => ParserT s st m Inline +charRef = do + c <- characterReference + return $ Str [c] + +lineBlockLine :: Stream [Char] m Char => ParserT [Char] st m String +lineBlockLine = try $ do + char '|' + char ' ' + white <- many (spaceChar >> return '\160') + notFollowedBy newline + line <- anyLine + continuations <- many (try $ char ' ' >> anyLine) + return $ white ++ unwords (line : continuations) + +blankLineBlockLine :: Stream [Char] m Char => ParserT [Char] st m Char +blankLineBlockLine = try (char '|' >> blankline) + +-- | Parses an RST-style line block and returns a list of strings. +lineBlockLines :: Stream [Char] m Char => ParserT [Char] st m [String] +lineBlockLines = try $ do + lines' <- many1 (lineBlockLine <|> ((:[]) <$> blankLineBlockLine)) + skipMany1 $ blankline <|> blankLineBlockLine + return lines' + +-- | Parse a table using 'headerParser', 'rowParser', +-- 'lineParser', and 'footerParser'. +tableWith :: Stream s m Char + => ParserT s ParserState m ([Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s ParserState m [Blocks]) + -> ParserT s ParserState m sep + -> ParserT s ParserState m end + -> ParserT s ParserState m Blocks +tableWith headerParser rowParser lineParser footerParser = try $ do + (heads, aligns, indices) <- headerParser + lines' <- rowParser indices `sepEndBy1` lineParser + footerParser + numColumns <- getOption readerColumns + let widths = if (indices == []) + then replicate (length aligns) 0.0 + else widthsFromIndices numColumns indices + return $ B.table mempty (zip aligns widths) heads lines' + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Double] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns' indices = + let numColumns = max numColumns' (if null indices then 0 else last indices) + lengths' = zipWith (-) indices (0:indices) + lengths = reverse $ + case reverse lengths' of + [] -> [] + [x] -> [x] + -- compensate for the fact that intercolumn + -- spaces are counted in widths of all columns + -- but the last... + (x:y:zs) -> if x < y && y - x <= 2 + then y:y:zs + else x:y:zs + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> (fromIntegral l) / quotient) lengths in + tail fracs + +--- + +-- Parse a grid table: starts with row of '-' on top, then header +-- (which may be grid), then the rows, +-- which may be grid, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +gridTableWith :: Stream [Char] m Char + => ParserT [Char] ParserState m Blocks -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT [Char] ParserState m Blocks +gridTableWith blocks headless = + tableWith (gridTableHeader headless blocks) (gridTableRow blocks) + (gridTableSep '-') gridTableFooter + +gridTableSplitLine :: [Int] -> String -> [String] +gridTableSplitLine indices line = map removeFinalBar $ tail $ + splitStringByIndices (init indices) $ trimr line + +gridPart :: Stream s m Char => Char -> ParserT s st m (Int, Int) +gridPart ch = do + dashes <- many1 (char ch) + char '+' + return (length dashes, length dashes + 1) + +gridDashedLines :: Stream s m Char => Char -> ParserT s st m [(Int,Int)] +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline + +removeFinalBar :: String -> String +removeFinalBar = + reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse + +-- | Separator between rows of grid table. +gridTableSep :: Stream s m Char => Char -> ParserT s ParserState m Char +gridTableSep ch = try $ gridDashedLines ch >> return '\n' + +-- | Parse header for a grid table. +gridTableHeader :: Stream [Char] m Char + => Bool -- ^ Headerless table + -> ParserT [Char] ParserState m Blocks + -> ParserT [Char] ParserState m ([Blocks], [Alignment], [Int]) +gridTableHeader headless blocks = try $ do + optional blanklines + dashes <- gridDashedLines '-' + rawContent <- if headless + then return $ repeat "" + else many1 + (notFollowedBy (gridTableSep '=') >> char '|' >> + many1Till anyChar newline) + if headless + then return () + else gridTableSep '=' >> return () + let lines' = map snd dashes + let indices = scanl (+) 0 lines' + let aligns = replicate (length lines') AlignDefault + -- RST does not have a notion of alignments + let rawHeads = if headless + then replicate (length dashes) "" + else map (intercalate " ") $ transpose + $ map (gridTableSplitLine indices) rawContent + heads <- mapM (parseFromString blocks) $ map trim rawHeads + return (heads, aligns, indices) + +gridTableRawLine :: Stream s m Char => [Int] -> ParserT s ParserState m [String] +gridTableRawLine indices = do + char '|' + line <- many1Till anyChar newline + return (gridTableSplitLine indices line) + +-- | Parse row of grid table. +gridTableRow :: Stream [Char] m Char + => ParserT [Char] ParserState m Blocks + -> [Int] + -> ParserT [Char] ParserState m [Blocks] +gridTableRow blocks indices = do + colLines <- many1 (gridTableRawLine indices) + let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ + transpose colLines + mapM (liftM compactifyCell . parseFromString blocks) cols + +removeOneLeadingSpace :: [String] -> [String] +removeOneLeadingSpace xs = + if all startsWithSpace xs + then map (drop 1) xs + else xs + where startsWithSpace "" = True + startsWithSpace (y:_) = y == ' ' + +compactifyCell :: Blocks -> Blocks +compactifyCell bs = head $ compactify [bs] + +-- | Parse footer for a grid table. +gridTableFooter :: Stream s m Char => ParserT s ParserState m [Char] +gridTableFooter = blanklines + +--- + +-- | Removes the ParsecT layer from the monad transformer stack +readWithM :: (Monad m) + => ParserT [Char] st m a -- ^ parser + -> st -- ^ initial state + -> String -- ^ input + -> m (Either PandocError a) +readWithM parser state input = + mapLeft (PandocParsecError input) `liftM` runParserT parser state "source" input + + +-- | Parse a string with a given parser and state +readWith :: Parser [Char] st a + -> st + -> String + -> Either PandocError a +readWith p t inp = runIdentity $ readWithM p t inp + +-- | Parse a string with @parser@ (for testing). +testStringWith :: (Show a) + => ParserT [Char] ParserState Identity a + -> [Char] + -> IO () +testStringWith parser str = UTF8.putStrLn $ show $ + readWith parser defaultParserState str + +-- | Parsing options. +data ParserState = ParserState + { stateOptions :: ReaderOptions, -- ^ User options + stateParserContext :: ParserContext, -- ^ Inside list? + stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateAllowLinks :: Bool, -- ^ Allow parsing of links + stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph + stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed + stateKeys :: KeyTable, -- ^ List of reference keys + stateHeaderKeys :: KeyTable, -- ^ List of implicit header ref keys + stateSubstitutions :: SubstTable, -- ^ List of substitution references + stateNotes :: NoteTable, -- ^ List of notes (raw bodies) + stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) + stateMeta :: Meta, -- ^ Document metadata + stateMeta' :: F Meta, -- ^ Document metadata + stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used + stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) + stateIdentifiers :: Set.Set String, -- ^ Header identifiers used + stateNextExample :: Int, -- ^ Number of next example + stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers + stateHasChapters :: Bool, -- ^ True if \chapter encountered + stateMacros :: [Macro], -- ^ List of macros defined so far + stateRstDefaultRole :: String, -- ^ Current rST default interpreted text role + stateRstCustomRoles :: M.Map String (String, Maybe String, Attr), -- ^ Current rST custom text roles + -- Triple represents: 1) Base role, 2) Optional format (only for :raw: + -- roles), 3) Additional classes (rest of Attr is unused)). + stateCaption :: Maybe Inlines, -- ^ Caption in current environment + stateInHtmlBlock :: Maybe String, -- ^ Tag type of HTML block being parsed + stateContainers :: [String], -- ^ parent include files + stateLogMessages :: [LogMessage], -- ^ log messages + stateMarkdownAttribute :: Bool -- ^ True if in markdown=1 context + } + +instance Default ParserState where + def = defaultParserState + +instance HasMeta ParserState where + setMeta field val st = + st{ stateMeta = setMeta field val $ stateMeta st } + deleteMeta field st = + st{ stateMeta = deleteMeta field $ stateMeta st } + +class HasReaderOptions st where + extractReaderOptions :: st -> ReaderOptions + getOption :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b + -- default + getOption f = (f . extractReaderOptions) <$> getState + +class HasQuoteContext st m where + getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext + withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a + +instance Monad m => HasQuoteContext ParserState m where + getQuoteContext = stateQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +instance HasReaderOptions ParserState where + extractReaderOptions = stateOptions + +class HasHeaderMap st where + extractHeaderMap :: st -> M.Map Inlines String + updateHeaderMap :: (M.Map Inlines String -> M.Map Inlines String) -> + st -> st + +instance HasHeaderMap ParserState where + extractHeaderMap = stateHeaders + updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st } + +class HasIdentifierList st where + extractIdentifierList :: st -> Set.Set String + updateIdentifierList :: (Set.Set String -> Set.Set String) -> st -> st + +instance HasIdentifierList ParserState where + extractIdentifierList = stateIdentifiers + updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } + +class HasMacros st where + extractMacros :: st -> [Macro] + updateMacros :: ([Macro] -> [Macro]) -> st -> st + +instance HasMacros ParserState where + extractMacros = stateMacros + updateMacros f st = st{ stateMacros = f $ stateMacros st } + +class HasLastStrPosition st where + setLastStrPos :: SourcePos -> st -> st + getLastStrPos :: st -> Maybe SourcePos + +instance HasLastStrPosition ParserState where + setLastStrPos pos st = st{ stateLastStrPos = Just pos } + getLastStrPos st = stateLastStrPos st + +class HasLogMessages st where + addLogMessage :: LogMessage -> st -> st + getLogMessages :: st -> [LogMessage] + +instance HasLogMessages ParserState where + addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st } + getLogMessages st = reverse $ stateLogMessages st + +defaultParserState :: ParserState +defaultParserState = + ParserState { stateOptions = def, + stateParserContext = NullState, + stateQuoteContext = NoQuote, + stateAllowLinks = True, + stateMaxNestingLevel = 6, + stateLastStrPos = Nothing, + stateKeys = M.empty, + stateHeaderKeys = M.empty, + stateSubstitutions = M.empty, + stateNotes = [], + stateNotes' = [], + stateMeta = nullMeta, + stateMeta' = return nullMeta, + stateHeaderTable = [], + stateHeaders = M.empty, + stateIdentifiers = Set.empty, + stateNextExample = 1, + stateExamples = M.empty, + stateHasChapters = False, + stateMacros = [], + stateRstDefaultRole = "title-reference", + stateRstCustomRoles = M.empty, + stateCaption = Nothing, + stateInHtmlBlock = Nothing, + stateContainers = [], + stateLogMessages = [], + stateMarkdownAttribute = False + } + +-- | Add a log message. +logMessage :: (Stream s m a, HasLogMessages st) + => LogMessage -> ParserT s st m () +logMessage msg = updateState (addLogMessage msg) + +-- | Report all the accumulated log messages, according to verbosity level. +reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m () +reportLogMessages = do + msgs <- getLogMessages <$> getState + mapM_ report msgs + +-- | Succeed only if the extension is enabled. +guardEnabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () +guardEnabled ext = getOption readerExtensions >>= guard . extensionEnabled ext + +-- | Succeed only if the extension is disabled. +guardDisabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () +guardDisabled ext = getOption readerExtensions >>= guard . not . extensionEnabled ext + +-- | Update the position on which the last string ended. +updateLastStrPos :: (Stream s m a, HasLastStrPosition st) => ParserT s st m () +updateLastStrPos = getPosition >>= updateState . setLastStrPos + +-- | Whether we are right after the end of a string. +notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool +notAfterString = do + pos <- getPosition + st <- getState + return $ getLastStrPos st /= Just pos + +data HeaderType + = SingleHeader Char -- ^ Single line of characters underneath + | DoubleHeader Char -- ^ Lines of characters above and below + deriving (Eq, Show) + +data ParserContext + = ListItemState -- ^ Used when running parser on list item contents + | NullState -- ^ Default state + deriving (Eq, Show) + +data QuoteContext + = InSingleQuote -- ^ Used when parsing inside single quotes + | InDoubleQuote -- ^ Used when parsing inside double quotes + | NoQuote -- ^ Used when not parsing inside quotes + deriving (Eq, Show) + +type NoteTable = [(String, String)] + +type NoteTable' = [(String, F Blocks)] -- used in markdown reader + +newtype Key = Key String deriving (Show, Read, Eq, Ord) + +toKey :: String -> Key +toKey = Key . map toLower . unwords . words . unbracket + where unbracket ('[':xs) | "]" `isSuffixOf` xs = take (length xs - 1) xs + unbracket xs = xs + +type KeyTable = M.Map Key (Target, Attr) + +type SubstTable = M.Map Key Inlines + +-- | Add header to the list of headers in state, together +-- with its associated identifier. If the identifier is null +-- and the auto_identifers extension is set, generate a new +-- unique identifier, and update the list of identifiers +-- in state. +registerHeader :: (Stream s m a, HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) + => Attr -> Inlines -> ParserT s st m Attr +registerHeader (ident,classes,kvs) header' = do + ids <- extractIdentifierList <$> getState + exts <- getOption readerExtensions + let insert' = M.insertWith (\_new old -> old) + if null ident && Ext_auto_identifiers `extensionEnabled` exts + then do + let id' = uniqueIdent (B.toList header') ids + let id'' = if Ext_ascii_identifiers `extensionEnabled` exts + then catMaybes $ map toAsciiChar id' + else id' + updateState $ updateIdentifierList $ Set.insert id' + updateState $ updateIdentifierList $ Set.insert id'' + updateState $ updateHeaderMap $ insert' header' id' + return (id'',classes,kvs) + else do + unless (null ident) $ + updateState $ updateHeaderMap $ insert' header' ident + return (ident,classes,kvs) + +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +smartPunctuation inlineParser = do + guardEnabled Ext_smart + choice [ quoted inlineParser, apostrophe, dash, ellipses ] + +apostrophe :: Stream s m Char => ParserT s st m Inlines +apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") + +quoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser + +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +singleQuoted inlineParser = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= + return . B.singleQuoted . mconcat + +doubleQuoted :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +doubleQuoted inlineParser = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ manyTill inlineParser doubleQuoteEnd >>= + return . B.doubleQuoted . mconcat + +failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) + => QuoteContext + -> ParserT s st m () +failIfInQuoteContext context = do + context' <- getQuoteContext + if context' == context + then fail "already inside quotes" + else return () + +charOrRef :: Stream s m Char => String -> ParserT s st m Char +charOrRef cs = + oneOf cs <|> try (do c <- characterReference + guard (c `elem` cs) + return c) + +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m () +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + -- single quote start can't be right after str + guard =<< notAfterString + () <$ charOrRef "'\8216\145" + +singleQuoteEnd :: Stream s m Char + => ParserT s st m () +singleQuoteEnd = try $ do + charOrRef "'\8217\146" + notFollowedBy alphaNum + +doubleQuoteStart :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m () +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + try $ do charOrRef "\"\8220\147" + notFollowedBy . satisfy $ flip elem [' ', '\t', '\n'] + +doubleQuoteEnd :: Stream s m Char + => ParserT s st m () +doubleQuoteEnd = void (charOrRef "\"\8221\148") + +ellipses :: Stream s m Char + => ParserT s st m Inlines +ellipses = try (string "..." >> return (B.str "\8230")) + +dash :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m Inlines +dash = try $ do + oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions + if oldDashes + then do + char '-' + (char '-' >> return (B.str "\8212")) + <|> (lookAhead digit >> return (B.str "\8211")) + else do + string "--" + (char '-' >> return (B.str "\8212")) + <|> return (B.str "\8211") + +-- This is used to prevent exponential blowups for things like: +-- a**a*a**a*a**a*a**a*a**a*a**a*a** +nested :: Stream s m a + => ParserT s ParserState m a + -> ParserT s ParserState m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +citeKey :: (Stream s m Char, HasLastStrPosition st) + => ParserT s st m (Bool, String) +citeKey = try $ do + guard =<< notAfterString + suppress_author <- option False (char '-' *> return True) + char '@' + firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite + let regchar = satisfy (\c -> isAlphaNum c || c == '_') + let internal p = try $ p <* lookAhead regchar + rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|> + try (oneOf ":/" <* lookAhead (char '/')) + let key = firstChar:rest + return (suppress_author, key) + + +token :: (Stream s m t) + => (t -> String) + -> (t -> SourcePos) + -> (t -> Maybe a) + -> ParsecT s st m a +token pp pos match = tokenPrim pp (\_ t _ -> pos t) match + +-- +-- Macros +-- + +-- | Parse a \newcommand or \renewcommand macro definition. +macro :: (Stream [Char] m Char, HasMacros st, HasReaderOptions st) + => ParserT [Char] st m Blocks +macro = do + apply <- getOption readerApplyMacros + inp <- getInput + case parseMacroDefinitions inp of + ([], _) -> mzero + (ms, rest) -> do def' <- count (length inp - length rest) anyChar + if apply + then do + updateState $ \st -> + updateMacros (ms ++) st + return mempty + else return $ rawBlock "latex" def' + +-- | Apply current macros to string. +applyMacros' :: (HasReaderOptions st, HasMacros st, Stream [Char] m Char) + => String + -> ParserT [Char] st m String +applyMacros' target = do + apply <- getOption readerApplyMacros + if apply + then do macros <- extractMacros <$> getState + return $ applyMacros macros target + else return target + +infixr 5 <+?> +(<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a +a <+?> b = a >>= flip fmap (try b <|> return mempty) . (<>) + +extractIdClass :: Attr -> Attr +extractIdClass (ident, cls, kvs) = (ident', cls', kvs') + where + ident' = case (lookup "id" kvs) of + Just v -> v + Nothing -> ident + cls' = case (lookup "class" kvs) of + Just cl -> words cl + Nothing -> cls + kvs' = filter (\(k,_) -> k /= "id" || k /= "class") kvs + +insertIncludedFile :: PandocMonad m + => ParserT String ParserState m Blocks + -> [FilePath] -> FilePath + -> ParserT String ParserState m Blocks +insertIncludedFile blocks dirs f = do + oldPos <- getPosition + oldInput <- getInput + containers <- stateContainers <$> getState + when (f `elem` containers) $ + throwError $ PandocParseError $ "Include file loop at " ++ show oldPos + updateState $ \s -> s{ stateContainers = f : stateContainers s } + mbcontents <- readFileFromDirs dirs f + contents <- case mbcontents of + Just s -> return s + Nothing -> do + report $ CouldNotLoadIncludeFile f oldPos + return "" + setPosition $ newPos f 1 1 + setInput contents + bs <- blocks + setInput oldInput + setPosition oldPos + updateState $ \s -> s{ stateContainers = tail $ stateContainers s } + return bs diff --git a/src/Text/Pandoc/Pretty.hs b/src/Text/Pandoc/Pretty.hs new file mode 100644 index 000000000..256f38b0c --- /dev/null +++ b/src/Text/Pandoc/Pretty.hs @@ -0,0 +1,557 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving, CPP #-} +{- +Copyright (C) 2010-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111(-1)307 USA +-} + +{- | + Module : Text.Pandoc.Pretty + Copyright : Copyright (C) 2010-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +A prettyprinting library for the production of text documents, +including wrapped text, indentated blocks, and tables. +-} + +module Text.Pandoc.Pretty ( + Doc + , render + , cr + , blankline + , blanklines + , space + , text + , char + , prefixed + , flush + , nest + , hang + , beforeNonBlank + , nowrap + , afterBreak + , offset + , minOffset + , height + , lblock + , cblock + , rblock + , (<>) + , (<+>) + , ($$) + , ($+$) + , isEmpty + , empty + , cat + , hcat + , hsep + , vcat + , vsep + , nestle + , chomp + , inside + , braces + , brackets + , parens + , quotes + , doubleQuotes + , charWidth + , realLength + ) + +where +import Data.Sequence (Seq, fromList, (<|), singleton, mapWithIndex, viewl, ViewL(..)) +import qualified Data.Sequence as Seq +import Data.Foldable (toList) +import Data.List (intersperse) +import Data.String +import Control.Monad.State +import Data.Char (isSpace) +import Data.Monoid ((<>)) + +data RenderState a = RenderState{ + output :: [a] -- ^ In reverse order + , prefix :: String + , usePrefix :: Bool + , lineLength :: Maybe Int -- ^ 'Nothing' means no wrapping + , column :: Int + , newlines :: Int -- ^ Number of preceding newlines + } + +type DocState a = State (RenderState a) () + +data D = Text Int String + | Block Int [String] + | Prefixed String Doc + | BeforeNonBlank Doc + | Flush Doc + | BreakingSpace + | AfterBreak String + | CarriageReturn + | NewLine + | BlankLines Int -- number of blank lines + deriving (Show) + +newtype Doc = Doc { unDoc :: Seq D } + deriving (Monoid, Show) + +instance IsString Doc where + fromString = text + +isBlank :: D -> Bool +isBlank BreakingSpace = True +isBlank CarriageReturn = True +isBlank NewLine = True +isBlank (BlankLines _) = True +isBlank (Text _ (c:_)) = isSpace c +isBlank _ = False + +-- | True if the document is empty. +isEmpty :: Doc -> Bool +isEmpty = Seq.null . unDoc + +-- | The empty document. +empty :: Doc +empty = mempty + +-- | Concatenate a list of 'Doc's. +cat :: [Doc] -> Doc +cat = mconcat + +-- | Same as 'cat'. +hcat :: [Doc] -> Doc +hcat = mconcat + +-- | Concatenate a list of 'Doc's, putting breakable spaces +-- between them. +infixr 6 <+> +(<+>) :: Doc -> Doc -> Doc +(<+>) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> space <> y + +-- | Same as 'cat', but putting breakable spaces between the +-- 'Doc's. +hsep :: [Doc] -> Doc +hsep = foldr (<+>) empty + +infixr 5 $$ +-- | @a $$ b@ puts @a@ above @b@. +($$) :: Doc -> Doc -> Doc +($$) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> cr <> y + +infixr 5 $+$ +-- | @a $+$ b@ puts @a@ above @b@, with a blank line between. +($+$) :: Doc -> Doc -> Doc +($+$) x y = if isEmpty x + then y + else if isEmpty y + then x + else x <> blankline <> y + +-- | List version of '$$'. +vcat :: [Doc] -> Doc +vcat = foldr ($$) empty + +-- | List version of '$+$'. +vsep :: [Doc] -> Doc +vsep = foldr ($+$) empty + +-- | Removes leading blank lines from a 'Doc'. +nestle :: Doc -> Doc +nestle (Doc d) = Doc $ go d + where go x = case viewl x of + (BlankLines _ :< rest) -> go rest + (NewLine :< rest) -> go rest + _ -> x + +-- | Chomps trailing blank space off of a 'Doc'. +chomp :: Doc -> Doc +chomp d = Doc (fromList dl') + where dl = toList (unDoc d) + dl' = reverse $ go $ reverse dl + go [] = [] + go (BreakingSpace : xs) = go xs + go (CarriageReturn : xs) = go xs + go (NewLine : xs) = go xs + go (BlankLines _ : xs) = go xs + go (Prefixed s d' : xs) = Prefixed s (chomp d') : xs + go xs = xs + +outp :: (IsString a) => Int -> String -> DocState a +outp off s | off < 0 = do -- offset < 0 means newline characters + st' <- get + let rawpref = prefix st' + when (column st' == 0 && usePrefix st' && not (null rawpref)) $ do + let pref = reverse $ dropWhile isSpace $ reverse rawpref + modify $ \st -> st{ output = fromString pref : output st + , column = column st + realLength pref } + let numnewlines = length $ takeWhile (=='\n') $ reverse s + modify $ \st -> st { output = fromString s : output st + , column = 0 + , newlines = newlines st + numnewlines } +outp off s = do -- offset >= 0 (0 might be combining char) + st' <- get + let pref = prefix st' + when (column st' == 0 && usePrefix st' && not (null pref)) $ do + modify $ \st -> st{ output = fromString pref : output st + , column = column st + realLength pref } + modify $ \st -> st{ output = fromString s : output st + , column = column st + off + , newlines = 0 } + +-- | Renders a 'Doc'. @render (Just n)@ will use +-- a line length of @n@ to reflow text on breakable spaces. +-- @render Nothing@ will not reflow text. +render :: (IsString a) => Maybe Int -> Doc -> a +render linelen doc = fromString . mconcat . reverse . output $ + execState (renderDoc doc) startingState + where startingState = RenderState{ + output = mempty + , prefix = "" + , usePrefix = True + , lineLength = linelen + , column = 0 + , newlines = 2 } + +renderDoc :: (IsString a, Monoid a) + => Doc -> DocState a +renderDoc = renderList . toList . unDoc + +data IsBlock = IsBlock Int [String] + +-- This would be nicer with a pattern synonym +-- pattern VBlock i s <- mkIsBlock -> Just (IsBlock ..) + +renderList :: (IsString a, Monoid a) + => [D] -> DocState a +renderList [] = return () +renderList (Text off s : xs) = do + outp off s + renderList xs + +renderList (Prefixed pref d : xs) = do + st <- get + let oldPref = prefix st + put st{ prefix = prefix st ++ pref } + renderDoc d + modify $ \s -> s{ prefix = oldPref } + renderList xs + +renderList (Flush d : xs) = do + st <- get + let oldUsePrefix = usePrefix st + put st{ usePrefix = False } + renderDoc d + modify $ \s -> s{ usePrefix = oldUsePrefix } + renderList xs + +renderList (BeforeNonBlank d : xs) = + case xs of + (x:_) | isBlank x -> renderList xs + | otherwise -> renderDoc d >> renderList xs + [] -> renderList xs + +renderList [BlankLines _] = return () + +renderList (BlankLines m : BlankLines n : xs) = + renderList (BlankLines (max m n) : xs) + +renderList (BlankLines num : xs) = do + st <- get + case output st of + _ | newlines st > num -> return () + | otherwise -> replicateM_ (1 + num - newlines st) (outp (-1) "\n") + renderList xs + +renderList (CarriageReturn : BlankLines m : xs) = + renderList (BlankLines m : xs) + +renderList (CarriageReturn : xs) = do + st <- get + if newlines st > 0 || null xs + then renderList xs + else do + outp (-1) "\n" + renderList xs + +renderList (NewLine : xs) = do + outp (-1) "\n" + renderList xs + +renderList (BreakingSpace : CarriageReturn : xs) = renderList (CarriageReturn:xs) +renderList (BreakingSpace : NewLine : xs) = renderList (NewLine:xs) +renderList (BreakingSpace : BlankLines n : xs) = renderList (BlankLines n:xs) +renderList (BreakingSpace : BreakingSpace : xs) = renderList (BreakingSpace:xs) +renderList (BreakingSpace : xs) = do + let isText (Text _ _) = True + isText (Block _ _) = True + isText (AfterBreak _) = True + isText _ = False + let isBreakingSpace BreakingSpace = True + isBreakingSpace _ = False + let xs' = dropWhile isBreakingSpace xs + let next = takeWhile isText xs' + st <- get + let off = sum $ map offsetOf next + case lineLength st of + Just l | column st + 1 + off > l -> do + outp (-1) "\n" + renderList xs' + _ -> do + outp 1 " " + renderList xs' + +renderList (AfterBreak s : xs) = do + st <- get + if newlines st > 0 + then outp (realLength s) s + else return () + renderList xs + +renderList (Block i1 s1 : Block i2 s2 : xs) = + renderList (mergeBlocks False (IsBlock i1 s1) (IsBlock i2 s2) : xs) + +renderList (Block i1 s1 : BreakingSpace : Block i2 s2 : xs) = + renderList (mergeBlocks True (IsBlock i1 s1) (IsBlock i2 s2) : xs) + +renderList (Block _width lns : xs) = do + st <- get + let oldPref = prefix st + case column st - realLength oldPref of + n | n > 0 -> modify $ \s -> s{ prefix = oldPref ++ replicate n ' ' } + _ -> return () + renderList $ intersperse CarriageReturn (map (Text 0) lns) + modify $ \s -> s{ prefix = oldPref } + renderList xs + +mergeBlocks :: Bool -> IsBlock -> IsBlock -> D +mergeBlocks addSpace (IsBlock w1 lns1) (IsBlock w2 lns2) = + Block (w1 + w2 + if addSpace then 1 else 0) $ + zipWith (\l1 l2 -> pad w1 l1 ++ l2) lns1' (map sp lns2') + where (lns1', lns2') = case (length lns1, length lns2) of + (x, y) | x > y -> (lns1, + lns2 ++ replicate (x - y) "") + | x < y -> (lns1 ++ replicate (y - x) "", + lns2) + | otherwise -> (lns1, lns2) + pad n s = s ++ replicate (n - realLength s) ' ' + sp "" = "" + sp xs = if addSpace then (' ' : xs) else xs + +offsetOf :: D -> Int +offsetOf (Text o _) = o +offsetOf (Block w _) = w +offsetOf BreakingSpace = 1 +offsetOf _ = 0 + +-- | A literal string. +text :: String -> Doc +text = Doc . toChunks + where toChunks :: String -> Seq D + toChunks [] = mempty + toChunks s = case break (=='\n') s of + ([], _:ys) -> NewLine <| toChunks ys + (xs, _:ys) -> Text (realLength xs) xs <| + (NewLine <| toChunks ys) + (xs, []) -> singleton $ Text (realLength xs) xs + +-- | A character. +char :: Char -> Doc +char c = text [c] + +-- | A breaking (reflowable) space. +space :: Doc +space = Doc $ singleton BreakingSpace + +-- | A carriage return. Does nothing if we're at the beginning of +-- a line; otherwise inserts a newline. +cr :: Doc +cr = Doc $ singleton CarriageReturn + +-- | Inserts a blank line unless one exists already. +-- (@blankline <> blankline@ has the same effect as @blankline@. +blankline :: Doc +blankline = Doc $ singleton (BlankLines 1) + +-- | Inserts a blank lines unless they exists already. +-- (@blanklines m <> blanklines n@ has the same effect as @blankline (max m n)@. +blanklines :: Int -> Doc +blanklines n = Doc $ singleton (BlankLines n) + +-- | Uses the specified string as a prefix for every line of +-- the inside document (except the first, if not at the beginning +-- of the line). +prefixed :: String -> Doc -> Doc +prefixed pref doc = Doc $ singleton $ Prefixed pref doc + +-- | Makes a 'Doc' flush against the left margin. +flush :: Doc -> Doc +flush doc = Doc $ singleton $ Flush doc + +-- | Indents a 'Doc' by the specified number of spaces. +nest :: Int -> Doc -> Doc +nest ind = prefixed (replicate ind ' ') + +-- | A hanging indent. @hang ind start doc@ prints @start@, +-- then @doc@, leaving an indent of @ind@ spaces on every +-- line but the first. +hang :: Int -> Doc -> Doc -> Doc +hang ind start doc = start <> nest ind doc + +-- | @beforeNonBlank d@ conditionally includes @d@ unless it is +-- followed by blank space. +beforeNonBlank :: Doc -> Doc +beforeNonBlank d = Doc $ singleton (BeforeNonBlank d) + +-- | Makes a 'Doc' non-reflowable. +nowrap :: Doc -> Doc +nowrap doc = Doc $ mapWithIndex replaceSpace $ unDoc doc + where replaceSpace _ BreakingSpace = Text 1 " " + replaceSpace _ x = x + +-- | Content to print only if it comes at the beginning of a line, +-- to be used e.g. for escaping line-initial `.` in groff man. +afterBreak :: String -> Doc +afterBreak s = Doc $ singleton (AfterBreak s) + +-- | Returns the width of a 'Doc'. +offset :: Doc -> Int +offset d = case map realLength . lines . render Nothing $ d of + [] -> 0 + os -> maximum os + +-- | Returns the minimal width of a 'Doc' when reflowed at breakable spaces. +minOffset :: Doc -> Int +minOffset d = maximum (0: map realLength (lines $ render (Just 0) d)) + +-- | @lblock n d@ is a block of width @n@ characters, with +-- text derived from @d@ and aligned to the left. +lblock :: Int -> Doc -> Doc +lblock = block id + +-- | Like 'lblock' but aligned to the right. +rblock :: Int -> Doc -> Doc +rblock w = block (\s -> replicate (w - realLength s) ' ' ++ s) w + +-- | Like 'lblock' but aligned centered. +cblock :: Int -> Doc -> Doc +cblock w = block (\s -> replicate ((w - realLength s) `div` 2) ' ' ++ s) w + +-- | Returns the height of a block or other 'Doc'. +height :: Doc -> Int +height = length . lines . render Nothing + +block :: (String -> String) -> Int -> Doc -> Doc +block filler width d + | width < 1 && not (isEmpty d) = error "Text.Pandoc.Pretty.block: width < 1" + | otherwise = Doc $ singleton $ Block width $ map filler + $ chop width $ render (Just width) d + +chop :: Int -> String -> [String] +chop _ [] = [] +chop n cs = case break (=='\n') cs of + (xs, ys) -> if len <= n + then case ys of + [] -> [xs] + ['\n'] -> [xs] + (_:zs) -> xs : chop n zs + else take n xs : chop n (drop n xs ++ ys) + where len = realLength xs + +-- | Encloses a 'Doc' inside a start and end 'Doc'. +inside :: Doc -> Doc -> Doc -> Doc +inside start end contents = + start <> contents <> end + +-- | Puts a 'Doc' in curly braces. +braces :: Doc -> Doc +braces = inside (char '{') (char '}') + +-- | Puts a 'Doc' in square brackets. +brackets :: Doc -> Doc +brackets = inside (char '[') (char ']') + +-- | Puts a 'Doc' in parentheses. +parens :: Doc -> Doc +parens = inside (char '(') (char ')') + +-- | Wraps a 'Doc' in single quotes. +quotes :: Doc -> Doc +quotes = inside (char '\'') (char '\'') + +-- | Wraps a 'Doc' in double quotes. +doubleQuotes :: Doc -> Doc +doubleQuotes = inside (char '"') (char '"') + +-- | Returns width of a character in a monospace font: 0 for a combining +-- character, 1 for a regular character, 2 for an East Asian wide character. +charWidth :: Char -> Int +charWidth c = + case c of + _ | c < '\x0300' -> 1 + | c >= '\x0300' && c <= '\x036F' -> 0 -- combining + | c >= '\x0370' && c <= '\x10FC' -> 1 + | c >= '\x1100' && c <= '\x115F' -> 2 + | c >= '\x1160' && c <= '\x11A2' -> 1 + | c >= '\x11A3' && c <= '\x11A7' -> 2 + | c >= '\x11A8' && c <= '\x11F9' -> 1 + | c >= '\x11FA' && c <= '\x11FF' -> 2 + | c >= '\x1200' && c <= '\x2328' -> 1 + | c >= '\x2329' && c <= '\x232A' -> 2 + | c >= '\x232B' && c <= '\x2E31' -> 1 + | c >= '\x2E80' && c <= '\x303E' -> 2 + | c == '\x303F' -> 1 + | c >= '\x3041' && c <= '\x3247' -> 2 + | c >= '\x3248' && c <= '\x324F' -> 1 -- ambiguous + | c >= '\x3250' && c <= '\x4DBF' -> 2 + | c >= '\x4DC0' && c <= '\x4DFF' -> 1 + | c >= '\x4E00' && c <= '\xA4C6' -> 2 + | c >= '\xA4D0' && c <= '\xA95F' -> 1 + | c >= '\xA960' && c <= '\xA97C' -> 2 + | c >= '\xA980' && c <= '\xABF9' -> 1 + | c >= '\xAC00' && c <= '\xD7FB' -> 2 + | c >= '\xD800' && c <= '\xDFFF' -> 1 + | c >= '\xE000' && c <= '\xF8FF' -> 1 -- ambiguous + | c >= '\xF900' && c <= '\xFAFF' -> 2 + | c >= '\xFB00' && c <= '\xFDFD' -> 1 + | c >= '\xFE00' && c <= '\xFE0F' -> 1 -- ambiguous + | c >= '\xFE10' && c <= '\xFE19' -> 2 + | c >= '\xFE20' && c <= '\xFE26' -> 1 + | c >= '\xFE30' && c <= '\xFE6B' -> 2 + | c >= '\xFE70' && c <= '\xFEFF' -> 1 + | c >= '\xFF01' && c <= '\xFF60' -> 2 + | c >= '\xFF61' && c <= '\x16A38' -> 1 + | c >= '\x1B000' && c <= '\x1B001' -> 2 + | c >= '\x1D000' && c <= '\x1F1FF' -> 1 + | c >= '\x1F200' && c <= '\x1F251' -> 2 + | c >= '\x1F300' && c <= '\x1F773' -> 1 + | c >= '\x20000' && c <= '\x3FFFD' -> 2 + | otherwise -> 1 + +-- | Get real length of string, taking into account combining and double-wide +-- characters. +realLength :: String -> Int +realLength = foldr (\a b -> charWidth a + b) 0 diff --git a/src/Text/Pandoc/Process.hs b/src/Text/Pandoc/Process.hs new file mode 100644 index 000000000..294a38a1b --- /dev/null +++ b/src/Text/Pandoc/Process.hs @@ -0,0 +1,98 @@ +{- +Copyright (C) 2013-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Process + Copyright : Copyright (C) 2013-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +ByteString variant of 'readProcessWithExitCode'. +-} +module Text.Pandoc.Process (pipeProcess) +where +import System.Process +import System.Exit (ExitCode (..)) +import Control.Exception +import System.IO (hClose, hFlush) +import Control.Concurrent (putMVar, takeMVar, newEmptyMVar, forkIO) +import Control.Monad (unless) +import qualified Data.ByteString.Lazy as BL + +{- | +Version of 'System.Process.readProcessWithExitCode' that uses lazy bytestrings +instead of strings and allows setting environment variables. + +@readProcessWithExitCode@ creates an external process, reads its +standard output strictly, waits until the process +terminates, and then returns the 'ExitCode' of the process +and the standard output. stderr is inherited from the parent. + +If an asynchronous exception is thrown to the thread executing +@readProcessWithExitCode@, the forked process will be terminated and +@readProcessWithExitCode@ will wait (block) until the process has been +terminated. +-} + +pipeProcess + :: Maybe [(String, String)] -- ^ environment variables + -> FilePath -- ^ Filename of the executable (see 'proc' for details) + -> [String] -- ^ any arguments + -> BL.ByteString -- ^ standard input + -> IO (ExitCode,BL.ByteString) -- ^ exitcode, stdout +pipeProcess mbenv cmd args input = + mask $ \restore -> do + (Just inh, Just outh, Nothing, pid) <- createProcess (proc cmd args) + { env = mbenv, + std_in = CreatePipe, + std_out = CreatePipe, + std_err = Inherit } + flip onException + (do hClose inh; hClose outh; + terminateProcess pid; waitForProcess pid) $ restore $ do + -- fork off a thread to start consuming stdout + out <- BL.hGetContents outh + waitOut <- forkWait $ evaluate $ BL.length out + + -- now write and flush any input + let writeInput = do + unless (BL.null input) $ do + BL.hPutStr inh input + hFlush inh + hClose inh + + writeInput + + -- wait on the output + waitOut + + hClose outh + + -- wait on the process + ex <- waitForProcess pid + + return (ex, out) + +forkWait :: IO a -> IO (IO a) +forkWait a = do + res <- newEmptyMVar + _ <- mask $ \restore -> forkIO $ try (restore a) >>= putMVar res + return (takeMVar res >>= either (\ex -> throwIO (ex :: SomeException)) return) diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs new file mode 100644 index 000000000..b0bcbd580 --- /dev/null +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -0,0 +1,128 @@ +{- +Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.CommonMark + Copyright : Copyright (C) 2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of CommonMark-formatted plain text to 'Pandoc' document. + +CommonMark is a strongly specified variant of Markdown: http://commonmark.org. +-} +module Text.Pandoc.Readers.CommonMark (readCommonMark) +where + +import CMark +import Data.Text (unpack, pack) +import Data.List (groupBy) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Class (PandocMonad) + +-- | Parse a CommonMark formatted string into a 'Pandoc' structure. +readCommonMark :: PandocMonad m => ReaderOptions -> String -> m Pandoc +readCommonMark opts s = return $ + nodeToPandoc $ commonmarkToNode opts' $ pack s + where opts' = if extensionEnabled Ext_smart (readerExtensions opts) + then [optNormalize, optSmart] + else [optNormalize] + +nodeToPandoc :: Node -> Pandoc +nodeToPandoc (Node _ DOCUMENT nodes) = + Pandoc nullMeta $ foldr addBlock [] nodes +nodeToPandoc n = -- shouldn't happen + Pandoc nullMeta $ foldr addBlock [] [n] + +addBlocks :: [Node] -> [Block] +addBlocks = foldr addBlock [] + +addBlock :: Node -> [Block] -> [Block] +addBlock (Node _ PARAGRAPH nodes) = + (Para (addInlines nodes) :) +addBlock (Node _ THEMATIC_BREAK _) = + (HorizontalRule :) +addBlock (Node _ BLOCK_QUOTE nodes) = + (BlockQuote (addBlocks nodes) :) +addBlock (Node _ (HTML_BLOCK t) _) = + (RawBlock (Format "html") (unpack t) :) +-- Note: the cmark parser will never generate CUSTOM_BLOCK, +-- so we don't need to handle it: +addBlock (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) = + id +addBlock (Node _ (CODE_BLOCK info t) _) = + (CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :) +addBlock (Node _ (HEADING lev) nodes) = + (Header lev ("",[],[]) (addInlines nodes) :) +addBlock (Node _ (LIST listAttrs) nodes) = + (constructor (map (setTightness . addBlocks . children) nodes) :) + where constructor = case listType listAttrs of + BULLET_LIST -> BulletList + ORDERED_LIST -> OrderedList + (start, DefaultStyle, delim) + start = listStart listAttrs + setTightness = if listTight listAttrs + then map paraToPlain + else id + paraToPlain (Para xs) = Plain (xs) + paraToPlain x = x + delim = case listDelim listAttrs of + PERIOD_DELIM -> Period + PAREN_DELIM -> OneParen +addBlock (Node _ ITEM _) = id -- handled in LIST +addBlock _ = id + +children :: Node -> [Node] +children (Node _ _ ns) = ns + +addInlines :: [Node] -> [Inline] +addInlines = foldr addInline [] + +addInline :: Node -> [Inline] -> [Inline] +addInline (Node _ (TEXT t) _) = (map toinl clumps ++) + where raw = unpack t + clumps = groupBy samekind raw + samekind ' ' ' ' = True + samekind ' ' _ = False + samekind _ ' ' = False + samekind _ _ = True + toinl (' ':_) = Space + toinl xs = Str xs +addInline (Node _ LINEBREAK _) = (LineBreak :) +addInline (Node _ SOFTBREAK _) = (SoftBreak :) +addInline (Node _ (HTML_INLINE t) _) = + (RawInline (Format "html") (unpack t) :) +-- Note: the cmark parser will never generate CUSTOM_BLOCK, +-- so we don't need to handle it: +addInline (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) = + id +addInline (Node _ (CODE t) _) = + (Code ("",[],[]) (unpack t) :) +addInline (Node _ EMPH nodes) = + (Emph (addInlines nodes) :) +addInline (Node _ STRONG nodes) = + (Strong (addInlines nodes) :) +addInline (Node _ (LINK url title) nodes) = + (Link nullAttr (addInlines nodes) (unpack url, unpack title) :) +addInline (Node _ (IMAGE url title) nodes) = + (Image nullAttr (addInlines nodes) (unpack url, unpack title) :) +addInline _ = id diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs new file mode 100644 index 000000000..bef256a93 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -0,0 +1,1055 @@ +module Text.Pandoc.Readers.DocBook ( readDocBook ) where +import Data.Char (toUpper) +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Options +import Text.Pandoc.Definition +import Text.Pandoc.Builder +import Text.XML.Light +import Text.HTML.TagSoup.Entity (lookupEntity) +import Data.Either (rights) +import Data.Generics +import Data.Char (isSpace) +import Control.Monad.State +import Data.List (intersperse) +import Data.Maybe (fromMaybe) +import Text.TeXMath (readMathML, writeTeX) +import Data.Default +import Data.Foldable (asum) +import Text.Pandoc.Class (PandocMonad) + +{- + +List of all DocBook tags, with [x] indicating implemented, +[o] meaning intentionally left unimplemented (pass through): + +[o] abbrev - An abbreviation, especially one followed by a period +[x] abstract - A summary +[o] accel - A graphical user interface (GUI) keyboard shortcut +[x] ackno - Acknowledgements in an Article +[o] acronym - An often pronounceable word made from the initial +[o] action - A response to a user event +[o] address - A real-world address, generally a postal address +[ ] affiliation - The institutional affiliation of an individual +[ ] alt - Text representation for a graphical element +[o] anchor - A spot in the document +[x] answer - An answer to a question posed in a QandASet +[x] appendix - An appendix in a Book or Article +[x] appendixinfo - Meta-information for an Appendix +[o] application - The name of a software program +[x] area - A region defined for a Callout in a graphic or code example +[x] areaset - A set of related areas in a graphic or code example +[x] areaspec - A collection of regions in a graphic or code example +[ ] arg - An argument in a CmdSynopsis +[x] article - An article +[x] articleinfo - Meta-information for an Article +[ ] artpagenums - The page numbers of an article as published +[x] attribution - The source of a block quote or epigraph +[ ] audiodata - Pointer to external audio data +[ ] audioobject - A wrapper for audio data and its associated meta-information +[x] author - The name of an individual author +[ ] authorblurb - A short description or note about an author +[x] authorgroup - Wrapper for author information when a document has + multiple authors or collabarators +[x] authorinitials - The initials or other short identifier for an author +[o] beginpage - The location of a page break in a print version of the document +[ ] bibliocoverage - The spatial or temporal coverage of a document +[x] bibliodiv - A section of a Bibliography +[x] biblioentry - An entry in a Bibliography +[x] bibliography - A bibliography +[ ] bibliographyinfo - Meta-information for a Bibliography +[ ] biblioid - An identifier for a document +[o] bibliolist - A wrapper for a set of bibliography entries +[ ] bibliomisc - Untyped bibliographic information +[x] bibliomixed - An entry in a Bibliography +[ ] bibliomset - A cooked container for related bibliographic information +[ ] biblioref - A cross reference to a bibliographic entry +[ ] bibliorelation - The relationship of a document to another +[ ] biblioset - A raw container for related bibliographic information +[ ] bibliosource - The source of a document +[ ] blockinfo - Meta-information for a block element +[x] blockquote - A quotation set off from the main text +[x] book - A book +[x] bookinfo - Meta-information for a Book +[x] bridgehead - A free-floating heading +[x] callout - A “called out” description of a marked Area +[x] calloutlist - A list of Callouts +[x] caption - A caption +[x] caution - A note of caution +[x] chapter - A chapter, as of a book +[x] chapterinfo - Meta-information for a Chapter +[ ] citation - An inline bibliographic reference to another published work +[ ] citebiblioid - A citation of a bibliographic identifier +[ ] citerefentry - A citation to a reference page +[ ] citetitle - The title of a cited work +[ ] city - The name of a city in an address +[x] classname - The name of a class, in the object-oriented programming sense +[ ] classsynopsis - The syntax summary for a class definition +[ ] classsynopsisinfo - Information supplementing the contents of + a ClassSynopsis +[ ] cmdsynopsis - A syntax summary for a software command +[ ] co - The location of a callout embedded in text +[x] code - An inline code fragment +[x] col - Specifications for a column in an HTML table +[x] colgroup - A group of columns in an HTML table +[ ] collab - Identifies a collaborator +[ ] collabname - The name of a collaborator +[ ] colophon - Text at the back of a book describing facts about its production +[x] colspec - Specifications for a column in a table +[x] command - The name of an executable program or other software command +[x] computeroutput - Data, generally text, displayed or presented by a computer +[ ] confdates - The dates of a conference for which a document was written +[ ] confgroup - A wrapper for document meta-information about a conference +[ ] confnum - An identifier, frequently numerical, associated with a conference for which a document was written +[ ] confsponsor - The sponsor of a conference for which a document was written +[ ] conftitle - The title of a conference for which a document was written +[x] constant - A programming or system constant +[ ] constraint - A constraint in an EBNF production +[ ] constraintdef - The definition of a constraint in an EBNF production +[ ] constructorsynopsis - A syntax summary for a constructor +[ ] contractnum - The contract number of a document +[ ] contractsponsor - The sponsor of a contract +[ ] contrib - A summary of the contributions made to a document by a + credited source +[ ] copyright - Copyright information about a document +[ ] coref - A cross reference to a co +[ ] corpauthor - A corporate author, as opposed to an individual +[ ] corpcredit - A corporation or organization credited in a document +[ ] corpname - The name of a corporation +[ ] country - The name of a country +[ ] database - The name of a database, or part of a database +[x] date - The date of publication or revision of a document +[ ] dedication - A wrapper for the dedication section of a book +[ ] destructorsynopsis - A syntax summary for a destructor +[ ] edition - The name or number of an edition of a document +[ ] editor - The name of the editor of a document +[x] email - An email address +[x] emphasis - Emphasized text +[x] entry - A cell in a table +[ ] entrytbl - A subtable appearing in place of an Entry in a table +[ ] envar - A software environment variable +[x] epigraph - A short inscription at the beginning of a document or component + note: also handle embedded attribution tag +[x] equation - A displayed mathematical equation +[ ] errorcode - An error code +[ ] errorname - An error name +[ ] errortext - An error message. +[ ] errortype - The classification of an error message +[ ] example - A formal example, with a title +[ ] exceptionname - The name of an exception +[ ] fax - A fax number +[ ] fieldsynopsis - The name of a field in a class definition +[x] figure - A formal figure, generally an illustration, with a title +[x] filename - The name of a file +[ ] firstname - The first name of a person +[ ] firstterm - The first occurrence of a term +[x] footnote - A footnote +[ ] footnoteref - A cross reference to a footnote (a footnote mark) +[x] foreignphrase - A word or phrase in a language other than the primary + language of the document +[x] formalpara - A paragraph with a title +[ ] funcdef - A function (subroutine) name and its return type +[ ] funcparams - Parameters for a function referenced through a function + pointer in a synopsis +[ ] funcprototype - The prototype of a function +[ ] funcsynopsis - The syntax summary for a function definition +[ ] funcsynopsisinfo - Information supplementing the FuncDefs of a FuncSynopsis +[x] function - The name of a function or subroutine, as in a + programming language +[x] glossary - A glossary +[x] glossaryinfo - Meta-information for a Glossary +[x] glossdef - A definition in a GlossEntry +[x] glossdiv - A division in a Glossary +[x] glossentry - An entry in a Glossary or GlossList +[x] glosslist - A wrapper for a set of GlossEntrys +[x] glosssee - A cross-reference from one GlossEntry to another +[x] glossseealso - A cross-reference from one GlossEntry to another +[x] glossterm - A glossary term +[ ] graphic - A displayed graphical object (not an inline) + Note: in DocBook v5 `graphic` is discarded +[ ] graphicco - A graphic that contains callout areas + Note: in DocBook v5 `graphicco` is discarded +[ ] group - A group of elements in a CmdSynopsis +[ ] guibutton - The text on a button in a GUI +[ ] guiicon - Graphic and/or text appearing as a icon in a GUI +[ ] guilabel - The text of a label in a GUI +[x] guimenu - The name of a menu in a GUI +[x] guimenuitem - The name of a terminal menu item in a GUI +[x] guisubmenu - The name of a submenu in a GUI +[ ] hardware - A physical part of a computer system +[ ] highlights - A summary of the main points of the discussed component +[ ] holder - The name of the individual or organization that holds a copyright +[o] honorific - The title of a person +[ ] html:form - An HTML form +[x] imagedata - Pointer to external image data (only `fileref` attribute + implemented but not `entityref` which would require parsing of the DTD) +[x] imageobject - A wrapper for image data and its associated meta-information +[ ] imageobjectco - A wrapper for an image object with callouts +[x] important - An admonition set off from the text +[x] index - An index +[x] indexdiv - A division in an index +[x] indexentry - An entry in an index +[x] indexinfo - Meta-information for an Index +[x] indexterm - A wrapper for terms to be indexed +[x] info - A wrapper for information about a component or other block. (DocBook v5) +[x] informalequation - A displayed mathematical equation without a title +[x] informalexample - A displayed example without a title +[ ] informalfigure - A untitled figure +[ ] informaltable - A table without a title +[ ] initializer - The initializer for a FieldSynopsis +[x] inlineequation - A mathematical equation or expression occurring inline +[ ] inlinegraphic - An object containing or pointing to graphical data + that will be rendered inline +[x] inlinemediaobject - An inline media object (video, audio, image, and so on) +[ ] interface - An element of a GUI +[ ] interfacename - The name of an interface +[ ] invpartnumber - An inventory part number +[ ] isbn - The International Standard Book Number of a document +[ ] issn - The International Standard Serial Number of a periodical +[ ] issuenum - The number of an issue of a journal +[x] itemizedlist - A list in which each entry is marked with a bullet or + other dingbat +[ ] itermset - A set of index terms in the meta-information of a document +[ ] jobtitle - The title of an individual in an organization +[x] keycap - The text printed on a key on a keyboard +[ ] keycode - The internal, frequently numeric, identifier for a key + on a keyboard +[x] keycombo - A combination of input actions +[ ] keysym - The symbolic name of a key on a keyboard +[ ] keyword - One of a set of keywords describing the content of a document +[ ] keywordset - A set of keywords describing the content of a document +[ ] label - A label on a Question or Answer +[ ] legalnotice - A statement of legal obligations or requirements +[ ] lhs - The left-hand side of an EBNF production +[ ] lineage - The portion of a person's name indicating a relationship to + ancestors +[ ] lineannotation - A comment on a line in a verbatim listing +[x] link - A hypertext link +[x] listitem - A wrapper for the elements of a list item +[x] literal - Inline text that is some literal value +[x] literallayout - A block of text in which line breaks and white space are + to be reproduced faithfully +[ ] lot - A list of the titles of formal objects (as tables or figures) in + a document +[ ] lotentry - An entry in a list of titles +[ ] manvolnum - A reference volume number +[x] markup - A string of formatting markup in text that is to be + represented literally +[ ] mathphrase - A mathematical phrase, an expression that can be represented + with ordinary text and a small amount of markup +[ ] medialabel - A name that identifies the physical medium on which some + information resides +[x] mediaobject - A displayed media object (video, audio, image, etc.) +[ ] mediaobjectco - A media object that contains callouts +[x] member - An element of a simple list +[x] menuchoice - A selection or series of selections from a menu +[ ] methodname - The name of a method +[ ] methodparam - Parameters to a method +[ ] methodsynopsis - A syntax summary for a method +[x] mml:math - A MathML equation +[ ] modespec - Application-specific information necessary for the + completion of an OLink +[ ] modifier - Modifiers in a synopsis +[ ] mousebutton - The conventional name of a mouse button +[ ] msg - A message in a message set +[ ] msgaud - The audience to which a message in a message set is relevant +[ ] msgentry - A wrapper for an entry in a message set +[ ] msgexplan - Explanatory material relating to a message in a message set +[ ] msginfo - Information about a message in a message set +[ ] msglevel - The level of importance or severity of a message in a message set +[ ] msgmain - The primary component of a message in a message set +[ ] msgorig - The origin of a message in a message set +[ ] msgrel - A related component of a message in a message set +[ ] msgset - A detailed set of messages, usually error messages +[ ] msgsub - A subcomponent of a message in a message set +[ ] msgtext - The actual text of a message component in a message set +[ ] nonterminal - A non-terminal in an EBNF production +[x] note - A message set off from the text +[ ] objectinfo - Meta-information for an object +[ ] olink - A link that addresses its target indirectly, through an entity +[ ] ooclass - A class in an object-oriented programming language +[ ] ooexception - An exception in an object-oriented programming language +[ ] oointerface - An interface in an object-oriented programming language +[x] option - An option for a software command +[x] optional - Optional information +[x] orderedlist - A list in which each entry is marked with a sequentially + incremented label +[ ] orgdiv - A division of an organization +[ ] orgname - The name of an organization other than a corporation +[ ] otheraddr - Uncategorized information in address +[ ] othercredit - A person or entity, other than an author or editor, + credited in a document +[ ] othername - A component of a persons name that is not a first name, + surname, or lineage +[ ] package - A package +[ ] pagenums - The numbers of the pages in a book, for use in a bibliographic + entry +[x] para - A paragraph +[ ] paramdef - Information about a function parameter in a programming language +[x] parameter - A value or a symbolic reference to a value +[ ] part - A division in a book +[ ] partinfo - Meta-information for a Part +[ ] partintro - An introduction to the contents of a part +[ ] personblurb - A short description or note about a person +[ ] personname - The personal name of an individual +[ ] phone - A telephone number +[ ] phrase - A span of text +[ ] pob - A post office box in an address +[ ] postcode - A postal code in an address +[x] preface - Introductory matter preceding the first chapter of a book +[ ] prefaceinfo - Meta-information for a Preface +[ ] primary - The primary word or phrase under which an index term should be + sorted +[ ] primaryie - A primary term in an index entry, not in the text +[ ] printhistory - The printing history of a document +[ ] procedure - A list of operations to be performed in a well-defined sequence +[ ] production - A production in a set of EBNF productions +[ ] productionrecap - A cross-reference to an EBNF production +[ ] productionset - A set of EBNF productions +[ ] productname - The formal name of a product +[ ] productnumber - A number assigned to a product +[x] programlisting - A literal listing of all or part of a program +[ ] programlistingco - A program listing with associated areas used in callouts +[x] prompt - A character or string indicating the start of an input field in + a computer display +[ ] property - A unit of data associated with some part of a computer system +[ ] pubdate - The date of publication of a document +[ ] publisher - The publisher of a document +[ ] publishername - The name of the publisher of a document +[ ] pubsnumber - A number assigned to a publication other than an ISBN or ISSN + or inventory part number +[x] qandadiv - A titled division in a QandASet +[o] qandaentry - A question/answer set within a QandASet +[o] qandaset - A question-and-answer set +[x] question - A question in a QandASet +[x] quote - An inline quotation +[ ] refclass - The scope or other indication of applicability of a + reference entry +[ ] refdescriptor - A description of the topic of a reference page +[ ] refentry - A reference page (originally a UNIX man-style reference page) +[ ] refentryinfo - Meta-information for a Refentry +[ ] refentrytitle - The title of a reference page +[ ] reference - A collection of reference entries +[ ] referenceinfo - Meta-information for a Reference +[ ] refmeta - Meta-information for a reference entry +[ ] refmiscinfo - Meta-information for a reference entry other than the title + and volume number +[ ] refname - The name of (one of) the subject(s) of a reference page +[ ] refnamediv - The name, purpose, and classification of a reference page +[ ] refpurpose - A short (one sentence) synopsis of the topic of a reference + page +[x] refsect1 - A major subsection of a reference entry +[x] refsect1info - Meta-information for a RefSect1 +[x] refsect2 - A subsection of a RefSect1 +[x] refsect2info - Meta-information for a RefSect2 +[x] refsect3 - A subsection of a RefSect2 +[x] refsect3info - Meta-information for a RefSect3 +[x] refsection - A recursive section in a refentry +[x] refsectioninfo - Meta-information for a refsection +[ ] refsynopsisdiv - A syntactic synopsis of the subject of the reference page +[ ] refsynopsisdivinfo - Meta-information for a RefSynopsisDiv +[x] releaseinfo - Information about a particular release of a document +[ ] remark - A remark (or comment) intended for presentation in a draft + manuscript +[ ] replaceable - Content that may or must be replaced by the user +[ ] returnvalue - The value returned by a function +[ ] revdescription - A extended description of a revision to a document +[ ] revhistory - A history of the revisions to a document +[ ] revision - An entry describing a single revision in the history of the + revisions to a document +[ ] revnumber - A document revision number +[ ] revremark - A description of a revision to a document +[ ] rhs - The right-hand side of an EBNF production +[x] row - A row in a table +[ ] sbr - An explicit line break in a command synopsis +[x] screen - Text that a user sees or might see on a computer screen +[o] screenco - A screen with associated areas used in callouts +[o] screeninfo - Information about how a screen shot was produced +[ ] screenshot - A representation of what the user sees or might see on a + computer screen +[ ] secondary - A secondary word or phrase in an index term +[ ] secondaryie - A secondary term in an index entry, rather than in the text +[x] sect1 - A top-level section of document +[x] sect1info - Meta-information for a Sect1 +[x] sect2 - A subsection within a Sect1 +[x] sect2info - Meta-information for a Sect2 +[x] sect3 - A subsection within a Sect2 +[x] sect3info - Meta-information for a Sect3 +[x] sect4 - A subsection within a Sect3 +[x] sect4info - Meta-information for a Sect4 +[x] sect5 - A subsection within a Sect4 +[x] sect5info - Meta-information for a Sect5 +[x] section - A recursive section +[x] sectioninfo - Meta-information for a recursive section +[x] see - Part of an index term directing the reader instead to another entry + in the index +[x] seealso - Part of an index term directing the reader also to another entry + in the index +[ ] seealsoie - A See also entry in an index, rather than in the text +[ ] seeie - A See entry in an index, rather than in the text +[x] seg - An element of a list item in a segmented list +[x] seglistitem - A list item in a segmented list +[x] segmentedlist - A segmented list, a list of sets of elements +[x] segtitle - The title of an element of a list item in a segmented list +[ ] seriesvolnums - Numbers of the volumes in a series of books +[ ] set - A collection of books +[ ] setindex - An index to a set of books +[ ] setindexinfo - Meta-information for a SetIndex +[ ] setinfo - Meta-information for a Set +[ ] sgmltag - A component of SGML markup +[ ] shortaffil - A brief description of an affiliation +[ ] shortcut - A key combination for an action that is also accessible through + a menu +[ ] sidebar - A portion of a document that is isolated from the main + narrative flow +[ ] sidebarinfo - Meta-information for a Sidebar +[x] simpara - A paragraph that contains only text and inline markup, no block + elements +[x] simplelist - An undecorated list of single words or short phrases +[ ] simplemsgentry - A wrapper for a simpler entry in a message set +[ ] simplesect - A section of a document with no subdivisions +[ ] spanspec - Formatting information for a spanned column in a table +[ ] state - A state or province in an address +[ ] step - A unit of action in a procedure +[ ] stepalternatives - Alternative steps in a procedure +[ ] street - A street address in an address +[ ] structfield - A field in a structure (in the programming language sense) +[ ] structname - The name of a structure (in the programming language sense) +[ ] subject - One of a group of terms describing the subject matter of a + document +[ ] subjectset - A set of terms describing the subject matter of a document +[ ] subjectterm - A term in a group of terms describing the subject matter of + a document +[x] subscript - A subscript (as in H2O, the molecular formula for water) +[ ] substeps - A wrapper for steps that occur within steps in a procedure +[x] subtitle - The subtitle of a document +[x] superscript - A superscript (as in x2, the mathematical notation for x + multiplied by itself) +[ ] surname - A family name; in western cultures the last name +[ ] svg:svg - An SVG graphic +[x] symbol - A name that is replaced by a value before processing +[ ] synopfragment - A portion of a CmdSynopsis broken out from the main body + of the synopsis +[ ] synopfragmentref - A reference to a fragment of a command synopsis +[ ] synopsis - A general-purpose element for representing the syntax of + commands or functions +[ ] systemitem - A system-related item or term +[ ] table - A formal table in a document +[ ] task - A task to be completed +[ ] taskprerequisites - The prerequisites for a task +[ ] taskrelated - Information related to a task +[ ] tasksummary - A summary of a task +[x] tbody - A wrapper for the rows of a table or informal table +[x] td - A table entry in an HTML table +[x] term - The word or phrase being defined or described in a variable list +[ ] termdef - An inline term definition +[ ] tertiary - A tertiary word or phrase in an index term +[ ] tertiaryie - A tertiary term in an index entry, rather than in the text +[ ] textdata - Pointer to external text data +[ ] textobject - A wrapper for a text description of an object and its + associated meta-information +[ ] tfoot - A table footer consisting of one or more rows +[x] tgroup - A wrapper for the main content of a table, or part of a table +[x] th - A table header entry in an HTML table +[x] thead - A table header consisting of one or more rows +[x] tip - A suggestion to the user, set off from the text +[x] title - The text of the title of a section of a document or of a formal + block-level element +[x] titleabbrev - The abbreviation of a Title +[x] toc - A table of contents +[x] tocback - An entry in a table of contents for a back matter component +[x] tocchap - An entry in a table of contents for a component in the body of + a document +[x] tocentry - A component title in a table of contents +[x] tocfront - An entry in a table of contents for a front matter component +[x] toclevel1 - A top-level entry within a table of contents entry for a + chapter-like component +[x] toclevel2 - A second-level entry within a table of contents entry for a + chapter-like component +[x] toclevel3 - A third-level entry within a table of contents entry for a + chapter-like component +[x] toclevel4 - A fourth-level entry within a table of contents entry for a + chapter-like component +[x] toclevel5 - A fifth-level entry within a table of contents entry for a + chapter-like component +[x] tocpart - An entry in a table of contents for a part of a book +[ ] token - A unit of information +[x] tr - A row in an HTML table +[ ] trademark - A trademark +[x] type - The classification of a value +[x] ulink - A link that addresses its target by means of a URL + (Uniform Resource Locator) +[x] uri - A Uniform Resource Identifier +[x] userinput - Data entered by the user +[x] varargs - An empty element in a function synopsis indicating a variable + number of arguments +[x] variablelist - A list in which each entry is composed of a set of one or + more terms and an associated description +[x] varlistentry - A wrapper for a set of terms and the associated description + in a variable list +[x] varname - The name of a variable +[ ] videodata - Pointer to external video data +[ ] videoobject - A wrapper for video data and its associated meta-information +[ ] void - An empty element in a function synopsis indicating that the + function in question takes no arguments +[ ] volumenum - The volume number of a document in a set (as of books in a set + or articles in a journal) +[x] warning - An admonition set off from the text +[x] wordasword - A word meant specifically as a word and not representing + anything else +[x] xref - A cross reference to another part of the document +[ ] year - The year of publication of a document +[x] ?asciidoc-br? - line break from asciidoc docbook output +-} + +type DB m = StateT DBState m + +data DBState = DBState{ dbSectionLevel :: Int + , dbQuoteType :: QuoteType + , dbMeta :: Meta + , dbAcceptsMeta :: Bool + , dbBook :: Bool + , dbFigureTitle :: Inlines + , dbContent :: [Content] + } deriving Show + +instance Default DBState where + def = DBState{ dbSectionLevel = 0 + , dbQuoteType = DoubleQuote + , dbMeta = mempty + , dbAcceptsMeta = False + , dbBook = False + , dbFigureTitle = mempty + , dbContent = [] } + + +readDocBook :: PandocMonad m => ReaderOptions -> String -> m Pandoc +readDocBook _ inp = do + let tree = normalizeTree . parseXML . handleInstructions $ inp + (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock $ tree + return $ Pandoc (dbMeta st') (toList . mconcat $ bs) + +-- We treat <?asciidoc-br?> specially (issue #1236), converting it +-- to <br/>, since xml-light doesn't parse the instruction correctly. +-- Other xml instructions are simply removed from the input stream. +handleInstructions :: String -> String +handleInstructions ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions xs +handleInstructions xs = case break (=='<') xs of + (ys, []) -> ys + ([], '<':zs) -> '<' : handleInstructions zs + (ys, zs) -> ys ++ handleInstructions zs + +getFigure :: PandocMonad m => Element -> DB m Blocks +getFigure e = do + tit <- case filterChild (named "title") e of + Just t -> getInlines t + Nothing -> return mempty + modify $ \st -> st{ dbFigureTitle = tit } + res <- getBlocks e + modify $ \st -> st{ dbFigureTitle = mempty } + return res + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = maybe (map toUpper e) id (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr elt = + case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of + Just z -> z + Nothing -> "" + +-- convenience function +named :: String -> Element -> Bool +named s e = qName (elName e) == s + +-- + +acceptingMetadata :: PandocMonad m => DB m a -> DB m a +acceptingMetadata p = do + modify (\s -> s { dbAcceptsMeta = True } ) + res <- p + modify (\s -> s { dbAcceptsMeta = False }) + return res + +checkInMeta :: (PandocMonad m, Monoid a) => DB m () -> DB m a +checkInMeta p = do + accepts <- dbAcceptsMeta <$> get + when accepts p + return mempty + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> DB m () +addMeta field val = modify (setMeta field val) + +instance HasMeta DBState where + setMeta field v s = s {dbMeta = setMeta field v (dbMeta s)} + deleteMeta field s = s {dbMeta = deleteMeta field (dbMeta s)} + +isBlockElement :: Content -> Bool +isBlockElement (Elem e) = qName (elName e) `elem` blocktags + where blocktags = ["toc","index","para","formalpara","simpara", + "ackno","epigraph","blockquote","bibliography","bibliodiv", + "biblioentry","glossee","glosseealso","glossary", + "glossdiv","glosslist","chapter","appendix","preface", + "bridgehead","sect1","sect2","sect3","sect4","sect5","section", + "refsect1","refsect2","refsect3","refsection", + "important","caution","note","tip","warning","qandadiv", + "question","answer","abstract","itemizedlist","orderedlist", + "variablelist","article","book","table","informaltable", + "informalexample", "linegroup", + "screen","programlisting","example","calloutlist"] +isBlockElement _ = False + +-- Trim leading and trailing newline characters +trimNl :: String -> String +trimNl = reverse . go . reverse . go + where go ('\n':xs) = xs + go xs = xs + +-- meld text into beginning of first paragraph of Blocks. +-- assumes Blocks start with a Para; if not, does nothing. +addToStart :: Inlines -> Blocks -> Blocks +addToStart toadd bs = + case toList bs of + (Para xs : rest) -> para (toadd <> fromList xs) <> fromList rest + _ -> bs + +-- function that is used by both mediaobject (in parseBlock) +-- and inlinemediaobject (in parseInline) +-- A DocBook mediaobject is a wrapper around a set of alternative presentations +getMediaobject :: PandocMonad m => Element -> DB m Inlines +getMediaobject e = do + (imageUrl, attr) <- + case filterChild (named "imageobject") e of + Nothing -> return (mempty, nullAttr) + Just z -> case filterChild (named "imagedata") z of + Nothing -> return (mempty, nullAttr) + Just i -> let atVal a = attrValue a i + w = case atVal "width" of + "" -> [] + d -> [("width", d)] + h = case atVal "depth" of + "" -> [] + d -> [("height", d)] + atr = (atVal "id", words $ atVal "role", w ++ h) + in return (atVal "fileref", atr) + let getCaption el = case filterChild (\x -> named "caption" x + || named "textobject" x + || named "alt" x) el of + Nothing -> return mempty + Just z -> mconcat <$> (mapM parseInline $ elContent z) + figTitle <- gets dbFigureTitle + let (caption, title) = if isNull figTitle + then (getCaption e, "") + else (return figTitle, "fig:") + liftM (imageWith attr imageUrl title) caption + +getBlocks :: PandocMonad m => Element -> DB m Blocks +getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + + +parseBlock :: PandocMonad m => Content -> DB m Blocks +parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE +parseBlock (Text (CData _ s _)) = if all isSpace s + then return mempty + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ map toUpper x +parseBlock (Elem e) = + case qName (elName e) of + "toc" -> return mempty -- skip TOC, since in pandoc it's autogenerated + "index" -> return mempty -- skip index, since page numbers meaningless + "para" -> parseMixed para (elContent e) + "formalpara" -> do + tit <- case filterChild (named "title") e of + Just t -> (para . strong . (<> str ".")) <$> + getInlines t + Nothing -> return mempty + (tit <>) <$> parseMixed para (elContent e) + "simpara" -> parseMixed para (elContent e) + "ackno" -> parseMixed para (elContent e) + "epigraph" -> parseBlockquote + "blockquote" -> parseBlockquote + "attribution" -> return mempty + "titleabbrev" -> return mempty + "authorinitials" -> return mempty + "title" -> checkInMeta getTitle + "author" -> checkInMeta getAuthor + "authorgroup" -> checkInMeta getAuthorGroup + "releaseinfo" -> checkInMeta (getInlines e >>= addMeta "release") + "date" -> checkInMeta getDate + "bibliography" -> sect 0 + "bibliodiv" -> sect 1 + "biblioentry" -> parseMixed para (elContent e) + "bibliomixed" -> parseMixed para (elContent e) + "glosssee" -> para . (\ils -> text "See " <> ils <> str ".") + <$> getInlines e + "glossseealso" -> para . (\ils -> text "See also " <> ils <> str ".") + <$> getInlines e + "glossary" -> sect 0 + "glossdiv" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "glosslist" -> definitionList <$> + mapM parseGlossEntry (filterChildren (named "glossentry") e) + "chapter" -> sect 0 + "appendix" -> sect 0 + "preface" -> sect 0 + "bridgehead" -> para . strong <$> getInlines e + "sect1" -> sect 1 + "sect2" -> sect 2 + "sect3" -> sect 3 + "sect4" -> sect 4 + "sect5" -> sect 5 + "section" -> gets dbSectionLevel >>= sect . (+1) + "refsect1" -> sect 1 + "refsect2" -> sect 2 + "refsect3" -> sect 3 + "refsection" -> gets dbSectionLevel >>= sect . (+1) + "important" -> blockQuote . (para (strong $ str "Important") <>) + <$> getBlocks e + "caution" -> blockQuote . (para (strong $ str "Caution") <>) + <$> getBlocks e + "note" -> blockQuote . (para (strong $ str "Note") <>) + <$> getBlocks e + "tip" -> blockQuote . (para (strong $ str "Tip") <>) + <$> getBlocks e + "warning" -> blockQuote . (para (strong $ str "Warning") <>) + <$> getBlocks e + "area" -> return mempty + "areaset" -> return mempty + "areaspec" -> return mempty + "qandadiv" -> gets dbSectionLevel >>= sect . (+1) + "question" -> addToStart (strong (str "Q:") <> str " ") <$> getBlocks e + "answer" -> addToStart (strong (str "A:") <> str " ") <$> getBlocks e + "abstract" -> blockQuote <$> getBlocks e + "calloutlist" -> bulletList <$> callouts + "itemizedlist" -> bulletList <$> listitems + "orderedlist" -> do + let listStyle = case attrValue "numeration" e of + "arabic" -> Decimal + "loweralpha" -> LowerAlpha + "upperalpha" -> UpperAlpha + "lowerroman" -> LowerRoman + "upperroman" -> UpperRoman + _ -> Decimal + let start = fromMaybe 1 $ + (attrValue "override" <$> filterElement (named "listitem") e) + >>= safeRead + orderedListWith (start,listStyle,DefaultDelim) + <$> listitems + "variablelist" -> definitionList <$> deflistitems + "figure" -> getFigure e + "mediaobject" -> para <$> getMediaobject e + "caption" -> return mempty + "info" -> metaBlock + "articleinfo" -> metaBlock + "sectioninfo" -> return mempty -- keywords & other metadata + "refsectioninfo" -> return mempty -- keywords & other metadata + "refsect1info" -> return mempty -- keywords & other metadata + "refsect2info" -> return mempty -- keywords & other metadata + "refsect3info" -> return mempty -- keywords & other metadata + "sect1info" -> return mempty -- keywords & other metadata + "sect2info" -> return mempty -- keywords & other metadata + "sect3info" -> return mempty -- keywords & other metadata + "sect4info" -> return mempty -- keywords & other metadata + "sect5info" -> return mempty -- keywords & other metadata + "chapterinfo" -> return mempty -- keywords & other metadata + "glossaryinfo" -> return mempty -- keywords & other metadata + "appendixinfo" -> return mempty -- keywords & other metadata + "bookinfo" -> metaBlock + "article" -> modify (\st -> st{ dbBook = False }) >> + getBlocks e + "book" -> modify (\st -> st{ dbBook = True }) >> getBlocks e + "table" -> parseTable + "informaltable" -> parseTable + "informalexample" -> divWith ("", ["informalexample"], []) <$> + getBlocks e + "linegroup" -> lineBlock <$> lineItems + "literallayout" -> codeBlockWithLang + "screen" -> codeBlockWithLang + "programlisting" -> codeBlockWithLang + "?xml" -> return mempty + _ -> getBlocks e + where parseMixed container conts = do + let (ils,rest) = break isBlockElement conts + ils' <- (trimInlines . mconcat) <$> mapM parseInline ils + let p = if ils' == mempty then mempty else container ils' + case rest of + [] -> return p + (r:rs) -> do + b <- parseBlock r + x <- parseMixed container rs + return $ p <> b <> x + codeBlockWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + x -> [x] + return $ codeBlockWith (attrValue "id" e, classes', []) + $ trimNl $ strContentRecursive e + parseBlockquote = do + attrib <- case filterChild (named "attribution") e of + Nothing -> return mempty + Just z -> (para . (str "— " <>) . mconcat) + <$> (mapM parseInline $ elContent z) + contents <- getBlocks e + return $ blockQuote (contents <> attrib) + listitems = mapM getBlocks $ filterChildren (named "listitem") e + callouts = mapM getBlocks $ filterChildren (named "callout") e + deflistitems = mapM parseVarListEntry $ filterChildren + (named "varlistentry") e + parseVarListEntry e' = do + let terms = filterChildren (named "term") e' + let items = filterChildren (named "listitem") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + parseGlossEntry e' = do + let terms = filterChildren (named "glossterm") e' + let items = filterChildren (named "glossdef") e' + terms' <- mapM getInlines terms + items' <- mapM getBlocks items + return (mconcat $ intersperse (str "; ") terms', items') + getTitle = do + tit <- getInlines e + subtit <- case filterChild (named "subtitle") e of + Just s -> (text ": " <>) <$> + getInlines s + Nothing -> return mempty + addMeta "title" (tit <> subtit) + + getAuthor = (:[]) <$> getInlines e >>= addMeta "author" + getAuthorGroup = do + let terms = filterChildren (named "author") e + mapM getInlines terms >>= addMeta "author" + getDate = getInlines e >>= addMeta "date" + parseTable = do + let isCaption x = named "title" x || named "caption" x + caption <- case filterChild isCaption e of + Just t -> getInlines t + Nothing -> return mempty + let e' = fromMaybe e $ filterChild (named "tgroup") e + let isColspec x = named "colspec" x || named "col" x + let colspecs = case filterChild (named "colgroup") e' of + Just c -> filterChildren isColspec c + _ -> filterChildren isColspec e' + let isRow x = named "row" x || named "tr" x + headrows <- case filterChild (named "thead") e' of + Just h -> case filterChild isRow h of + Just x -> parseRow x + Nothing -> return [] + Nothing -> return [] + bodyrows <- case filterChild (named "tbody") e' of + Just b -> mapM parseRow + $ filterChildren isRow b + Nothing -> mapM parseRow + $ filterChildren isRow e' + let toAlignment c = case findAttr (unqual "align") c of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let toWidth c = case findAttr (unqual "colwidth") c of + Just w -> fromMaybe 0 + $ safeRead $ '0': filter (\x -> + (x >= '0' && x <= '9') + || x == '.') w + Nothing -> 0 :: Double + let numrows = case bodyrows of + [] -> 0 + xs -> maximum $ map length xs + let aligns = case colspecs of + [] -> replicate numrows AlignDefault + cs -> map toAlignment cs + let widths = case colspecs of + [] -> replicate numrows 0 + cs -> let ws = map toWidth cs + tot = sum ws + in if all (> 0) ws + then map (/ tot) ws + else replicate numrows 0 + let headrows' = if null headrows + then replicate numrows mempty + else headrows + return $ table caption (zip aligns widths) + headrows' bodyrows + isEntry x = named "entry" x || named "td" x || named "th" x + parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry + sect n = do isbook <- gets dbBook + let n' = if isbook || n == 0 then n + 1 else n + headerText <- case filterChild (named "title") e `mplus` + (filterChild (named "info") e >>= + filterChild (named "title")) of + Just t -> getInlines t + Nothing -> return mempty + modify $ \st -> st{ dbSectionLevel = n } + b <- getBlocks e + let ident = attrValue "id" e + modify $ \st -> st{ dbSectionLevel = n - 1 } + return $ headerWith (ident,[],[]) n' headerText <> b + lineItems = mapM getInlines $ filterChildren (named "line") e + metaBlock = acceptingMetadata (getBlocks e) >> return mempty + +getInlines :: PandocMonad m => Element -> DB m Inlines +getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') + +strContentRecursive :: Element -> String +strContentRecursive = strContent . + (\e' -> e'{ elContent = map elementToStr $ elContent e' }) + +elementToStr :: Content -> Content +elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing +elementToStr x = x + +parseInline :: PandocMonad m => Content -> DB m Inlines +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = + return $ maybe (text $ map toUpper ref) (text) $ lookupEntity ref +parseInline (Elem e) = + case qName (elName e) of + "equation" -> equation displayMath + "informalequation" -> equation displayMath + "inlineequation" -> equation math + "subscript" -> subscript <$> innerInlines + "superscript" -> superscript <$> innerInlines + "inlinemediaobject" -> getMediaobject e + "quote" -> do + qt <- gets dbQuoteType + let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote + modify $ \st -> st{ dbQuoteType = qt' } + contents <- innerInlines + modify $ \st -> st{ dbQuoteType = qt } + return $ if qt == SingleQuote + then singleQuoted contents + else doubleQuoted contents + "simplelist" -> simpleList + "segmentedlist" -> segmentedList + "classname" -> codeWithLang + "code" -> codeWithLang + "filename" -> codeWithLang + "literal" -> codeWithLang + "computeroutput" -> codeWithLang + "prompt" -> codeWithLang + "parameter" -> codeWithLang + "option" -> codeWithLang + "optional" -> do x <- getInlines e + return $ str "[" <> x <> str "]" + "markup" -> codeWithLang + "wordasword" -> emph <$> innerInlines + "command" -> codeWithLang + "varname" -> codeWithLang + "function" -> codeWithLang + "type" -> codeWithLang + "symbol" -> codeWithLang + "constant" -> codeWithLang + "userinput" -> codeWithLang + "varargs" -> return $ code "(...)" + "keycap" -> return (str $ strContent e) + "keycombo" -> keycombo <$> (mapM parseInline $ elContent e) + "menuchoice" -> menuchoice <$> (mapM parseInline $ + filter isGuiMenu $ elContent e) + "xref" -> do + content <- dbContent <$> get + let linkend = attrValue "linkend" e + let title = case attrValue "endterm" e of + "" -> maybe "???" xrefTitleByElem + (findElementById linkend content) + endterm -> maybe "???" strContent + (findElementById endterm content) + return $ link ('#' : linkend) "" (text title) + "email" -> return $ link ("mailto:" ++ strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e + "ulink" -> link (attrValue "url" e) "" <$> innerInlines + "link" -> do + ils <- innerInlines + let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + Just h -> h + _ -> ('#' : attrValue "linkend" e) + let ils' = if ils == mempty then str href else ils + let attr = (attrValue "id" e, words $ attrValue "role" e, []) + return $ linkWith attr href "" ils' + "foreignphrase" -> emph <$> innerInlines + "emphasis" -> case attrValue "role" e of + "bold" -> strong <$> innerInlines + "strong" -> strong <$> innerInlines + "strikethrough" -> strikeout <$> innerInlines + _ -> emph <$> innerInlines + "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) + "title" -> return mempty + "affiliation" -> return mempty + -- Note: this isn't a real docbook tag; it's what we convert + -- <?asciidor-br?> to in handleInstructions, above. A kludge to + -- work around xml-light's inability to parse an instruction. + "br" -> return linebreak + _ -> innerInlines + where innerInlines = (trimInlines . mconcat) <$> + (mapM parseInline $ elContent e) + equation constructor = return $ mconcat $ + map (constructor . writeTeX) + $ rights + $ map (readMathML . showElement . everywhere (mkT removePrefix)) + $ filterChildren (\x -> qName (elName x) == "math" && + qPrefix (elName x) == Just "mml") e + removePrefix elname = elname { qPrefix = Nothing } + codeWithLang = do + let classes' = case attrValue "language" e of + "" -> [] + l -> [l] + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e + simpleList = (mconcat . intersperse (str "," <> space)) <$> mapM getInlines + (filterChildren (named "member") e) + segmentedList = do + tit <- maybe (return mempty) getInlines $ filterChild (named "title") e + segtits <- mapM getInlines $ filterChildren (named "segtitle") e + segitems <- mapM (mapM getInlines . filterChildren (named "seg")) + $ filterChildren (named "seglistitem") e + let toSeg = mconcat . zipWith (\x y -> strong (x <> str ":") <> space <> + y <> linebreak) segtits + let segs = mconcat $ map toSeg segitems + let tit' = if tit == mempty + then mempty + else strong tit <> linebreak + return $ linebreak <> tit' <> segs + keycombo = spanWith ("",["keycombo"],[]) . + mconcat . intersperse (str "+") + menuchoice = spanWith ("",["menuchoice"],[]) . + mconcat . intersperse (text " > ") + isGuiMenu (Elem x) = named "guimenu" x || named "guisubmenu" x || + named "guimenuitem" x + isGuiMenu _ = False + + findElementById idString content + = asum [filterElement (\x -> attrValue "id" x == idString) el | Elem el <- content] + + -- Use the 'xreflabel' attribute for getting the title of a xref link; + -- if there's no such attribute, employ some heuristics based on what + -- docbook-xsl does. + xrefTitleByElem el + | not (null xrefLabel) = xrefLabel + | otherwise = case qName (elName el) of + "chapter" -> descendantContent "title" el + "sect1" -> descendantContent "title" el + "sect2" -> descendantContent "title" el + "sect3" -> descendantContent "title" el + "sect4" -> descendantContent "title" el + "sect5" -> descendantContent "title" el + "cmdsynopsis" -> descendantContent "command" el + "funcsynopsis" -> descendantContent "function" el + _ -> qName (elName el) ++ "_title" + where + xrefLabel = attrValue "xreflabel" el + descendantContent name = maybe "???" strContent + . filterElementName (\n -> qName n == name) diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs new file mode 100644 index 000000000..8936a0403 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -0,0 +1,626 @@ +{-# LANGUAGE PatternGuards, OverloadedStrings, CPP #-} + +{- +Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx + Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of Docx entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (italics and underline both read as Emph) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [X] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [X] Image + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.Docx + ( readDocxWithWarnings + , readDocx + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder +import Text.Pandoc.Walk +import Text.Pandoc.Readers.Docx.Parse +import Text.Pandoc.Readers.Docx.Lists +import Text.Pandoc.Readers.Docx.Combine +import Text.Pandoc.Shared +import Text.Pandoc.MediaBag (MediaBag) +import Data.List (delete, intersect) +import Text.TeXMath (writeTeX) +import Data.Default (Default) +import qualified Data.ByteString.Lazy as B +import qualified Data.Map as M +import qualified Data.Set as Set +import Control.Monad.Reader +import Control.Monad.State +import Data.Sequence (ViewL(..), viewl) +import qualified Data.Sequence as Seq (null) +#if !(MIN_VERSION_base(4,8,0)) +import Data.Traversable (traverse) +#endif +import Text.Pandoc.Error +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P +import Text.Pandoc.Logging + +readDocx :: PandocMonad m + => ReaderOptions + -> B.ByteString + -> m Pandoc +readDocx opts bytes + | Right archive <- toArchiveOrFail bytes + , Right (docx, parserWarnings) <- archiveToDocxWithWarnings archive = do + mapM_ (P.report . DocxParserWarning) parserWarnings + (meta, blks) <- docxToOutput opts docx + return $ Pandoc meta blks +readDocx _ _ = + throwError $ PandocSomeError "couldn't parse docx file" + +-- TODO remove this for 2.0: +readDocxWithWarnings :: PandocMonad m + => ReaderOptions + -> B.ByteString + -> m Pandoc +readDocxWithWarnings = readDocx + +data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag + , docxDropCap :: Inlines + , docxWarnings :: [String] + } + +instance Default DState where + def = DState { docxAnchorMap = M.empty + , docxMediaBag = mempty + , docxDropCap = mempty + , docxWarnings = [] + } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxInHeaderBlock :: Bool } + +instance Default DEnv where + def = DEnv def False + +type DocxContext m = ReaderT DEnv (StateT DState m) + +evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a +evalDocxContext ctx env st = flip evalStateT st $ flip runReaderT env $ ctx + +-- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + +divsToKeep :: [String] +divsToKeep = ["list-item", "Definition", "DefinitionTerm"] + +metaStyles :: M.Map String String +metaStyles = M.fromList [ ("Title", "title") + , ("Subtitle", "subtitle") + , ("Author", "author") + , ("Date", "date") + , ("Abstract", "abstract")] + +sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart]) +sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp)) + +isMetaPar :: BodyPart -> Bool +isMetaPar (Paragraph pPr _) = + not $ null $ intersect (pStyle pPr) (M.keys metaStyles) +isMetaPar _ = False + +isEmptyPar :: BodyPart -> Bool +isEmptyPar (Paragraph _ parParts) = + all isEmptyParPart parParts + where + isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems + isEmptyParPart _ = False + isEmptyElem (TextRun s) = trim s == "" + isEmptyElem _ = True +isEmptyPar _ = False + +bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String MetaValue) +bodyPartsToMeta' [] = return M.empty +bodyPartsToMeta' (bp : bps) + | (Paragraph pPr parParts) <- bp + , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles) + , (Just metaField) <- M.lookup c metaStyles = do + inlines <- smushInlines <$> mapM parPartToInlines parParts + remaining <- bodyPartsToMeta' bps + let + f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils'] + f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks) + f m (MetaList mv) = MetaList (m : mv) + f m n = MetaList [m, n] + return $ M.insertWith f metaField (MetaInlines (toList inlines)) remaining +bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps + +bodyPartsToMeta :: PandocMonad m => [BodyPart] -> DocxContext m Meta +bodyPartsToMeta bps = do + mp <- bodyPartsToMeta' bps + let mp' = + case M.lookup "author" mp of + Just mv -> M.insert "author" (fixAuthors mv) mp + Nothing -> mp + return $ Meta mp' + +fixAuthors :: MetaValue -> MetaValue +fixAuthors (MetaBlocks blks) = + MetaList $ map g $ filter f blks + where f (Para _) = True + f _ = False + g (Para ils) = MetaInlines ils + g _ = MetaInlines [] +fixAuthors mv = mv + +codeStyles :: [String] +codeStyles = ["VerbatimChar"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> Inlines +runElemToInlines (TextRun s) = text s +runElemToInlines (LnBrk) = linebreak +runElemToInlines (Tab) = space +runElemToInlines (SoftHyphen) = text "\xad" +runElemToInlines (NoBreakHyphen) = text "\x2011" + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] +runElemToString (Tab) = ['\t'] +runElemToString (SoftHyphen) = ['\xad'] +runElemToString (NoBreakHyphen) = ['\x2011'] + +runToString :: Run -> String +runToString (Run _ runElems) = concatMap runElemToString runElems +runToString _ = "" + +parPartToString :: ParPart -> String +parPartToString (PlainRun run) = runToString run +parPartToString (InternalHyperLink _ runs) = concatMap runToString runs +parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs +parPartToString _ = "" + +blacklistedCharStyles :: [String] +blacklistedCharStyles = ["Hyperlink"] + +resolveDependentRunStyle :: RunStyle -> RunStyle +resolveDependentRunStyle rPr + | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = + rPr + | Just (_, cs) <- rStyle rPr = + let rPr' = resolveDependentRunStyle cs + in + RunStyle { isBold = case isBold rPr of + Just bool -> Just bool + Nothing -> isBold rPr' + , isItalic = case isItalic rPr of + Just bool -> Just bool + Nothing -> isItalic rPr' + , isSmallCaps = case isSmallCaps rPr of + Just bool -> Just bool + Nothing -> isSmallCaps rPr' + , isStrike = case isStrike rPr of + Just bool -> Just bool + Nothing -> isStrike rPr' + , rVertAlign = case rVertAlign rPr of + Just valign -> Just valign + Nothing -> rVertAlign rPr' + , rUnderline = case rUnderline rPr of + Just ulstyle -> Just ulstyle + Nothing -> rUnderline rPr' + , rStyle = rStyle rPr } + | otherwise = rPr + +runStyleToTransform :: RunStyle -> (Inlines -> Inlines) +runStyleToTransform rPr + | Just (s, _) <- rStyle rPr + , s `elem` spansToKeep = + let rPr' = rPr{rStyle = Nothing} + in + (spanWith ("", [s], [])) . (runStyleToTransform rPr') + | Just True <- isItalic rPr = + emph . (runStyleToTransform rPr {isItalic = Nothing}) + | Just True <- isBold rPr = + strong . (runStyleToTransform rPr {isBold = Nothing}) + | Just True <- isSmallCaps rPr = + smallcaps . (runStyleToTransform rPr {isSmallCaps = Nothing}) + | Just True <- isStrike rPr = + strikeout . (runStyleToTransform rPr {isStrike = Nothing}) + | Just SupScrpt <- rVertAlign rPr = + superscript . (runStyleToTransform rPr {rVertAlign = Nothing}) + | Just SubScrpt <- rVertAlign rPr = + subscript . (runStyleToTransform rPr {rVertAlign = Nothing}) + | Just "single" <- rUnderline rPr = + emph . (runStyleToTransform rPr {rUnderline = Nothing}) + | otherwise = id + +runToInlines :: PandocMonad m => Run -> DocxContext m Inlines +runToInlines (Run rs runElems) + | Just (s, _) <- rStyle rs + , s `elem` codeStyles = + let rPr = resolveDependentRunStyle rs + codeString = code $ concatMap runElemToString runElems + in + return $ case rVertAlign rPr of + Just SupScrpt -> superscript codeString + Just SubScrpt -> subscript codeString + _ -> codeString + | otherwise = do + let ils = smushInlines (map runElemToInlines runElems) + return $ (runStyleToTransform $ resolveDependentRunStyle rs) ils +runToInlines (Footnote bps) = do + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + return $ note blksList +runToInlines (Endnote bps) = do + blksList <- smushBlocks <$> (mapM bodyPartToBlocks bps) + return $ note blksList +runToInlines (InlineDrawing fp title alt bs ext) = do + (lift . lift) $ P.insertMedia fp Nothing bs + return $ imageWith (extentToAttr ext) fp title $ text alt +runToInlines InlineChart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" + +extentToAttr :: Extent -> Attr +extentToAttr (Just (w, h)) = + ("", [], [("width", showDim w), ("height", showDim h)] ) + where + showDim d = show (d / 914400) ++ "in" +extentToAttr _ = nullAttr + +blocksToInlinesWarn :: PandocMonad m => String -> Blocks -> DocxContext m Inlines +blocksToInlinesWarn cmtId blks = do + let blkList = toList blks + notParaOrPlain :: Block -> Bool + notParaOrPlain (Para _) = False + notParaOrPlain (Plain _) = False + notParaOrPlain _ = True + when (not $ null $ filter notParaOrPlain blkList) $ + lift $ P.report $ DocxParserWarning $ + "Docx comment " ++ cmtId ++ " will not retain formatting" + return $ fromList $ blocksToInlines blkList + +parPartToInlines :: PandocMonad m => ParPart -> DocxContext m Inlines +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> smushInlines <$> mapM runToInlines runs + RejectChanges -> return mempty + AllChanges -> do + ils <- smushInlines <$> mapM runToInlines runs + let attr = ("", ["insertion"], [("author", author), ("date", date)]) + return $ spanWith attr ils +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AcceptChanges -> return mempty + RejectChanges -> smushInlines <$> mapM runToInlines runs + AllChanges -> do + ils <- smushInlines <$> mapM runToInlines runs + let attr = ("", ["deletion"], [("author", author), ("date", date)]) + return $ spanWith attr ils +parPartToInlines (CommentStart cmtId author date bodyParts) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AllChanges -> do + blks <- smushBlocks <$> mapM bodyPartToBlocks bodyParts + ils <- blocksToInlinesWarn cmtId blks + let attr = ("", ["comment-start"], [("id", cmtId), ("author", author), ("date", date)]) + return $ spanWith attr ils + _ -> return mempty +parPartToInlines (CommentEnd cmtId) = do + opts <- asks docxOptions + case readerTrackChanges opts of + AllChanges -> do + let attr = ("", ["comment-end"], [("id", cmtId)]) + return $ spanWith attr mempty + _ -> return mempty +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = + return mempty +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- get whether we're in a header. + inHdrBool <- asks docxInHeaderBlock + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- We don't want to rewrite if we're in a header, since we'll take + -- care of that later, when we make the header anchor. If the + -- bookmark were already in uniqueIdent form, this would lead to a + -- duplication. Otherwise, we check to see if the id is already in + -- there. Rewrite if necessary. This will have the possible effect + -- of rewriting user-defined anchor links. However, since these + -- are not defined in pandoc, it seems like a necessary evil to + -- avoid an extra pass. + let newAnchor = + if not inHdrBool && anchor `elem` (M.elems anchorMap) + then uniqueIdent [Str anchor] (Set.fromList $ M.elems anchorMap) + else anchor + unless inHdrBool + (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap}) + return $ spanWith (newAnchor, ["anchor"], []) mempty +parPartToInlines (Drawing fp title alt bs ext) = do + (lift . lift) $ P.insertMedia fp Nothing bs + return $ imageWith (extentToAttr ext) fp title $ text alt +parPartToInlines Chart = do + return $ spanWith ("", ["chart"], []) $ text "[CHART]" +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- smushInlines <$> mapM runToInlines runs + return $ link ('#' : anchor) "" ils +parPartToInlines (ExternalHyperLink target runs) = do + ils <- smushInlines <$> mapM runToInlines runs + return $ link target "" ils +parPartToInlines (PlainOMath exps) = do + return $ math $ writeTeX exps +parPartToInlines (SmartTag runs) = do + ils <- smushInlines <$> mapM runToInlines runs + return ils + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (_, classes, kvs) _) = + classes == ["anchor"] && + null kvs +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchor :: PandocMonad m => Blocks -> DocxContext m Blocks +makeHeaderAnchor bs = traverse makeHeaderAnchor' bs + +makeHeaderAnchor' :: PandocMonad m => Block -> DocxContext m Block +-- If there is an anchor already there (an anchor span in the header, +-- to be exact), we rename and associate the new id with the old one. +makeHeaderAnchor' (Header n (ident, classes, kvs) ils) + | (c:_) <- filter isAnchorSpan ils + , (Span (anchIdent, ["anchor"], _) cIls) <- c = do + hdrIDMap <- gets docxAnchorMap + let newIdent = if null ident + then uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) + else ident + newIls = concatMap f ils where f il | il == c = cIls + | otherwise = [il] + modify $ \s -> s {docxAnchorMap = M.insert anchIdent newIdent hdrIDMap} + makeHeaderAnchor' $ Header n (newIdent, classes, kvs) newIls +-- Otherwise we just give it a name, and register that name (associate +-- it with itself.) +makeHeaderAnchor' (Header n (ident, classes, kvs) ils) = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = if null ident + then uniqueIdent ils (Set.fromList $ M.elems hdrIDMap) + else ident + modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) ils +makeHeaderAnchor' blk = return blk + +-- Rewrite a standalone paragraph block as a plain +singleParaToPlain :: Blocks -> Blocks +singleParaToPlain blks + | (Para (ils) :< seeq) <- viewl $ unMany blks + , Seq.null seeq = + singleton $ Plain ils +singleParaToPlain blks = blks + +cellToBlocks :: PandocMonad m => Cell -> DocxContext m Blocks +cellToBlocks (Cell bps) = do + blks <- smushBlocks <$> mapM bodyPartToBlocks bps + return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + +rowToBlocksList :: PandocMonad m => Row -> DocxContext m [Blocks] +rowToBlocksList (Row cells) = do + blksList <- mapM cellToBlocks cells + return $ map singleParaToPlain blksList + +trimLineBreaks :: [Inline] -> [Inline] +trimLineBreaks [] = [] +trimLineBreaks (LineBreak : ils) = trimLineBreaks ils +trimLineBreaks ils + | (LineBreak : ils') <- reverse ils = trimLineBreaks (reverse ils') +trimLineBreaks ils = ils + +parStyleToTransform :: ParagraphStyle -> (Blocks -> Blocks) +parStyleToTransform pPr + | (c:cs) <- pStyle pPr + , c `elem` divsToKeep = + let pPr' = pPr { pStyle = cs } + in + (divWith ("", [c], [])) . (parStyleToTransform pPr') + | (c:cs) <- pStyle pPr, + c `elem` listParagraphDivs = + let pPr' = pPr { pStyle = cs, indentation = Nothing} + in + (divWith ("", [c], [])) . (parStyleToTransform pPr') + | (_:cs) <- pStyle pPr + , Just True <- pBlockQuote pPr = + let pPr' = pPr { pStyle = cs } + in + blockQuote . (parStyleToTransform pPr') + | (_:cs) <- pStyle pPr = + let pPr' = pPr { pStyle = cs} + in + parStyleToTransform pPr' + | null (pStyle pPr) + , Just left <- indentation pPr >>= leftParIndent + , Just hang <- indentation pPr >>= hangingParIndent = + let pPr' = pPr { indentation = Nothing } + in + case (left - hang) > 0 of + True -> blockQuote . (parStyleToTransform pPr') + False -> parStyleToTransform pPr' + | null (pStyle pPr), + Just left <- indentation pPr >>= leftParIndent = + let pPr' = pPr { indentation = Nothing } + in + case left > 0 of + True -> blockQuote . (parStyleToTransform pPr') + False -> parStyleToTransform pPr' +parStyleToTransform _ = id + +bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks +bodyPartToBlocks (Paragraph pPr parparts) + | not $ null $ codeDivs `intersect` (pStyle pPr) = + return + $ parStyleToTransform pPr + $ codeBlock + $ concatMap parPartToString parparts + | Just (style, n) <- pHeading pPr = do + ils <- local (\s-> s{docxInHeaderBlock=True}) $ + (smushInlines <$> mapM parPartToInlines parparts) + makeHeaderAnchor $ + headerWith ("", delete style (pStyle pPr), []) n ils + | otherwise = do + ils <- smushInlines <$> mapM parPartToInlines parparts >>= + (return . fromList . trimLineBreaks . normalizeSpaces . toList) + dropIls <- gets docxDropCap + let ils' = dropIls <> ils + if dropCap pPr + then do modify $ \s -> s { docxDropCap = ils' } + return mempty + else do modify $ \s -> s { docxDropCap = mempty } + return $ case isNull ils' of + True -> mempty + _ -> parStyleToTransform pPr $ para ils' +bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do + let + kvs = case levelInfo of + (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ divWith ("", ["list-item"], kvs) blks +bodyPartToBlocks (ListItem pPr _ _ _ parparts) = + let pPr' = pPr {pStyle = "ListParagraph": (pStyle pPr)} + in + bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (Tbl _ _ _ []) = + return $ para mempty +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do + let caption = text cap + (hdr, rows) = case firstRowFormatting look of + True | null rs -> (Nothing, [r]) + | otherwise -> (Just r, rs) + False -> (Nothing, r:rs) + + cells <- mapM rowToBlocksList rows + + let width = case cells of + r':_ -> length r' + -- shouldn't happen + [] -> 0 + + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return $ replicate width mempty + + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + let alignments = replicate width AlignDefault + widths = replicate width 0 :: [Double] + + return $ table caption (zip alignments widths) hdrCells cells +bodyPartToBlocks (OMathPara e) = do + return $ para $ displayMath (writeTeX e) + + +-- replace targets with generated anchors. +rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline +rewriteLink' l@(Link attr ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link attr ils ('#':newTarget, title)) + Nothing -> l +rewriteLink' il = return il + +rewriteLinks :: PandocMonad m => [Block] -> DocxContext m [Block] +rewriteLinks = mapM (walkM rewriteLink') + +bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) +bodyToOutput (Body bps) = do + let (metabps, blkbps) = sepBodyParts bps + meta <- bodyPartsToMeta metabps + blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps + blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks + return $ (meta, blks') + +docxToOutput :: PandocMonad m + => ReaderOptions + -> Docx + -> m (Meta, [Block]) +docxToOutput opts (Docx (Document _ body)) = + let dEnv = def { docxOptions = opts} in + evalDocxContext (bodyToOutput body) dEnv def diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs new file mode 100644 index 000000000..39e0df825 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -0,0 +1,154 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, + PatternGuards #-} + +module Text.Pandoc.Readers.Docx.Combine ( smushInlines + , smushBlocks + ) + where + +import Text.Pandoc.Builder +import Data.List +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr, (><), (|>)) +import qualified Data.Sequence as Seq (null) + +data Modifier a = Modifier (a -> a) + | AttrModifier (Attr -> a -> a) Attr + | NullModifier + +spaceOutInlinesL :: Inlines -> (Inlines, Inlines) +spaceOutInlinesL ms = (l, stackInlines fs (m' <> r)) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlinesR :: Inlines -> (Inlines, Inlines) +spaceOutInlinesR ms = (stackInlines fs (l <> m'), r) + where (l, m, r) = spaceOutInlines ms + (fs, m') = unstackInlines m + +spaceOutInlines :: Inlines -> (Inlines, Inlines, Inlines) +spaceOutInlines ils = + let (fs, ils') = unstackInlines ils + contents = unMany ils' + left = case viewl contents of + (Space :< _) -> space + _ -> mempty + right = case viewr contents of + (_ :> Space) -> space + _ -> mempty in + (left, (stackInlines fs $ trimInlines . Many $ contents), right) + +stackInlines :: [Modifier Inlines] -> Inlines -> Inlines +stackInlines [] ms = ms +stackInlines (NullModifier : fs) ms = stackInlines fs ms +stackInlines ((Modifier f) : fs) ms = + if isEmpty ms + then stackInlines fs ms + else f $ stackInlines fs ms +stackInlines ((AttrModifier f attr) : fs) ms = f attr $ stackInlines fs ms + +unstackInlines :: Inlines -> ([Modifier Inlines], Inlines) +unstackInlines ms = case ilModifier ms of + NullModifier -> ([], ms) + _ -> (f : fs, ms') where + f = ilModifier ms + (fs, ms') = unstackInlines $ ilInnards ms + +ilModifier :: Inlines -> Modifier Inlines +ilModifier ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph _) -> Modifier emph + (Strong _) -> Modifier strong + (SmallCaps _) -> Modifier smallcaps + (Strikeout _) -> Modifier strikeout + (Superscript _) -> Modifier superscript + (Subscript _) -> Modifier subscript + (Link attr _ tgt) -> Modifier $ linkWith attr (fst tgt) (snd tgt) + (Span attr _) -> AttrModifier spanWith attr + _ -> NullModifier + _ -> NullModifier + +ilInnards :: Inlines -> Inlines +ilInnards ils = case viewl (unMany ils) of + (x :< xs) | Seq.null xs -> case x of + (Emph lst) -> fromList lst + (Strong lst) -> fromList lst + (SmallCaps lst) -> fromList lst + (Strikeout lst) -> fromList lst + (Superscript lst) -> fromList lst + (Subscript lst) -> fromList lst + (Link _ lst _) -> fromList lst + (Span _ lst) -> fromList lst + _ -> ils + _ -> ils + +inlinesL :: Inlines -> (Inlines, Inlines) +inlinesL ils = case viewl $ unMany ils of + (s :< sq) -> (singleton s, Many sq) + _ -> (mempty, ils) + +inlinesR :: Inlines -> (Inlines, Inlines) +inlinesR ils = case viewr $ unMany ils of + (sq :> s) -> (Many sq, singleton s) + _ -> (ils, mempty) + +combineInlines :: Inlines -> Inlines -> Inlines +combineInlines x y = + let (xs', x') = inlinesR x + (y', ys') = inlinesL y + in + xs' <> (combineSingletonInlines x' y') <> ys' + +combineSingletonInlines :: Inlines -> Inlines -> Inlines +combineSingletonInlines x y = + let (xfs, xs) = unstackInlines x + (yfs, ys) = unstackInlines y + shared = xfs `intersect` yfs + x_remaining = xfs \\ shared + y_remaining = yfs \\ shared + x_rem_attr = filter isAttrModifier x_remaining + y_rem_attr = filter isAttrModifier y_remaining + in + case null shared of + True | isEmpty xs && isEmpty ys -> + stackInlines (x_rem_attr ++ y_rem_attr) mempty + | isEmpty xs -> + let (sp, y') = spaceOutInlinesL y in + (stackInlines x_rem_attr mempty) <> sp <> y' + | isEmpty ys -> + let (x', sp) = spaceOutInlinesR x in + x' <> sp <> (stackInlines y_rem_attr mempty) + | otherwise -> + let (x', xsp) = spaceOutInlinesR x + (ysp, y') = spaceOutInlinesL y + in + x' <> xsp <> ysp <> y' + False -> stackInlines shared $ + combineInlines + (stackInlines x_remaining xs) + (stackInlines y_remaining ys) + +combineBlocks :: Blocks -> Blocks -> Blocks +combineBlocks bs cs + | bs' :> (BlockQuote bs'') <- viewr (unMany bs) + , (BlockQuote cs'') :< cs' <- viewl (unMany cs) = + Many $ (bs' |> (BlockQuote (bs'' <> cs''))) >< cs' +combineBlocks bs cs = bs <> cs + +instance (Monoid a, Eq a) => Eq (Modifier a) where + (Modifier f) == (Modifier g) = (f mempty == g mempty) + (AttrModifier f attr) == (AttrModifier g attr') = (f attr mempty == g attr' mempty) + (NullModifier) == (NullModifier) = True + _ == _ = False + +isEmpty :: (Monoid a, Eq a) => a -> Bool +isEmpty x = x == mempty + +isAttrModifier :: Modifier a -> Bool +isAttrModifier (AttrModifier _ _) = True +isAttrModifier _ = False + +smushInlines :: [Inlines] -> Inlines +smushInlines xs = foldl combineInlines mempty xs + +smushBlocks :: [Blocks] -> Blocks +smushBlocks xs = foldl combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs new file mode 100644 index 000000000..395a53907 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -0,0 +1,229 @@ +{- +Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Lists + Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Functions for converting flat docx paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets + , blocksToDefinitions + , listParagraphDivs + ) where + +import Text.Pandoc.JSON +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of + Just n -> n + Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of + Just n -> n + Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), + ("lowerLetter", LowerAlpha), + ("upperRoman", UpperRoman), + ("lowerRoman", LowerRoman), + ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), + ("(%1)", TwoParens), + ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = + let + start = lookup "start" kvs + frmt = lookup "format" kvs + txt = lookup "text" kvs + in + case frmt of + Just "bullet" -> Just Itemized + Just f -> + case txt of + Just t -> Just $ Enumerated ( + read (fromMaybe "1" start) :: Int, + fromMaybe DefaultStyle (lookup f listStyleMap), + fromMaybe DefaultDelim (lookup t listDelimMap)) + Nothing -> Nothing + _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( + (Div attr1@(_, classes1, _) blks1) : + (Div (ident2, classes2, kvs2) blks2) : + blks + ) | "list-item" `elem` classes1 && + not ("list-item" `elem` classes2) && + (not . null) (listParagraphDivs `intersect` classes2) = + -- We don't want to keep this indent. + let newDiv2 = + (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + in + handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = + (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) + | getLevelN b == num = b : (flatToBullets' num elems) + | otherwise = + let bNumId = getNumIdN b + bLevel = getLevelN b + (children, remaining) = + span + (\b' -> + ((getLevelN b') > bLevel || + ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + xs + in + case getListType b of + Just (Enumerated attr) -> + (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + _ -> + (BulletList (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +singleItemHeaderToHeader :: Block -> Block +singleItemHeaderToHeader (OrderedList _ [[h@(Header _ _ _)]]) = h +singleItemHeaderToHeader blk = blk + + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = + map singleItemHeaderToHeader $ + bottomUp removeListDivs $ + flatToBullets $ (handleListParagraphs blks) + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' [] acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = + reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc + ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + pair = case remainingAttr2 == ("", [], []) of + True -> (concatMap plainParaInlines blks1, [blks2]) + False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + in + blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc + ((Div (ident2, classes2, kvs2) blks2) : blks) + | (not . null) defAcc && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + defItems2 = case remainingAttr2 == ("", [], []) of + True -> blks2 + False -> [Div remainingAttr2 blks2] + ((defTerm, defItems):defs) = defAcc + defAcc' = case null defItems of + True -> (defTerm, [defItems2]) : defs + False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + in + blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = + blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = + blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + +removeListDivs' :: Block -> [Block] +removeListDivs' (Div (ident, classes, kvs) blks) + | "list-item" `elem` classes = + case delete "list-item" classes of + [] -> blks + classes' -> [Div (ident, classes', kvs) $ blks] +removeListDivs' (Div (ident, classes, kvs) blks) + | not $ null $ listParagraphDivs `intersect` classes = + case classes \\ listParagraphDivs of + [] -> blks + classes' -> [Div (ident, classes', kvs) blks] +removeListDivs' blk = [blk] + +removeListDivs :: [Block] -> [Block] +removeListDivs = concatMap removeListDivs' + + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs new file mode 100644 index 000000000..221a1d10a --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -0,0 +1,1044 @@ +{-# LANGUAGE PatternGuards, ViewPatterns, FlexibleInstances #-} + +{- +Copyright (C) 2014-2016 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014-2016 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> + Stability : alpha + Portability : portable + +Conversion of docx archive into Docx haskell type +-} + +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , Extent + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , VertAlign(..) + , ParIndentation(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , archiveToDocx + , archiveToDocxWithWarnings + ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad.Reader +import Control.Monad.State +import Control.Applicative ((<|>)) +import qualified Data.Map as M +import Control.Monad.Except +import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive) +import Text.TeXMath.Readers.OMML (readOMML) +import Text.TeXMath.Unicode.Fonts (getUnicode, stringToFont, Font(..)) +import Text.TeXMath (Exp) +import Text.Pandoc.Readers.Docx.Util +import Data.Char (readLitChar, ord, chr, isDigit) + +data ReaderEnv = ReaderEnv { envNotes :: Notes + , envComments :: Comments + , envNumbering :: Numbering + , envRelationships :: [Relationship] + , envMedia :: Media + , envFont :: Maybe Font + , envCharStyles :: CharStyleMap + , envParStyles :: ParStyleMap + , envLocation :: DocumentLocation + } + deriving Show + +data ReaderState = ReaderState { stateWarnings :: [String] } + deriving Show + +data DocxError = DocxError | WrongElem + deriving Show + +type D = ExceptT DocxError (ReaderT ReaderEnv (State ReaderState)) + +runD :: D a -> ReaderEnv -> ReaderState -> (Either DocxError a, ReaderState) +runD dx re rs = runState (runReaderT (runExceptT dx) re) rs + +maybeToD :: Maybe a -> D a +maybeToD (Just a) = return a +maybeToD Nothing = throwError DocxError + +eitherToD :: Either a b -> D b +eitherToD (Right b) = return b +eitherToD (Left _) = throwError DocxError + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) + + +-- This is similar to `mapMaybe`: it maps a function returning the D +-- monad over a list, and only keeps the non-erroring return values. +mapD :: (a -> D b) -> [a] -> D [b] +mapD f xs = + let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return []) + in + concatMapM handler xs + +data Docx = Docx Document + deriving Show + +data Document = Document NameSpaces Body + deriving Show + +data Body = Body [BodyPart] + deriving Show + +type Media = [(FilePath, B.ByteString)] + +type CharStyle = (String, RunStyle) + +type ParStyle = (String, ParStyleData) + +type CharStyleMap = M.Map String RunStyle + +type ParStyleMap = M.Map String ParStyleData + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +data DocumentLocation = InDocument | InFootnote | InEndnote + deriving (Eq,Show) + +data Relationship = Relationship DocumentLocation RelId Target + deriving Show + +data Notes = Notes NameSpaces + (Maybe (M.Map String Element)) + (Maybe (M.Map String Element)) + deriving Show + +data Comments = Comments NameSpaces (M.Map String Element) + deriving Show + +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indentation :: Maybe ParIndentation + , dropCap :: Bool + , pHeading :: Maybe (String, Int) + , pNumInfo :: Maybe (String, String) + , pBlockQuote :: Maybe Bool + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indentation = Nothing + , dropCap = False + , pHeading = Nothing + , pNumInfo = Nothing + , pBlockQuote = Nothing + } + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String (Maybe Level) [ParPart] + | Tbl String TblGrid TblLook [Row] + | OMathPara [Exp] + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +data Row = Row [Cell] + deriving Show + +data Cell = Cell [BodyPart] + deriving Show + +-- (width, height) in EMUs +type Extent = Maybe (Double, Double) + +data ParPart = PlainRun Run + | Insertion ChangeId Author ChangeDate [Run] + | Deletion ChangeId Author ChangeDate [Run] + | CommentStart CommentId Author CommentDate [BodyPart] + | CommentEnd CommentId + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink URL [Run] + | Drawing FilePath String String B.ByteString Extent -- title, alt + | Chart -- placeholder for now + | PlainOMath [Exp] + | SmartTag [Run] + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote [BodyPart] + | Endnote [BodyPart] + | InlineDrawing FilePath String String B.ByteString Extent -- title, alt + | InlineChart -- placeholder + deriving Show + +data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen + deriving Show + +data VertAlign = BaseLn | SupScrpt | SubScrpt + deriving Show + +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rStyle :: Maybe CharStyle} + deriving Show + +data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) + , isBlockQuote :: Maybe Bool + , numInfo :: Maybe (String, String) + , psStyle :: Maybe ParStyle} + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = Nothing + , isItalic = Nothing + , isSmallCaps = Nothing + , isStrike = Nothing + , rVertAlign = Nothing + , rUnderline = Nothing + , rStyle = Nothing} + +type Target = String +type Anchor = String +type URL = String +type BookMarkId = String +type RelId = String +type ChangeId = String +type CommentId = String +type Author = String +type ChangeDate = String +type CommentDate = String + +archiveToDocx :: Archive -> Either DocxError Docx +archiveToDocx archive = fst <$> archiveToDocxWithWarnings archive + +archiveToDocxWithWarnings :: Archive -> Either DocxError (Docx, [String]) +archiveToDocxWithWarnings archive = do + let notes = archiveToNotes archive + comments = archiveToComments archive + numbering = archiveToNumbering archive + rels = archiveToRelationships archive + media = filteredFilesFromArchive archive filePathIsMedia + (styles, parstyles) = archiveToStyles archive + rEnv = + ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument + rState = ReaderState { stateWarnings = [] } + (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState + case eitherDoc of + Right doc -> Right (Docx doc, stateWarnings st) + Left e -> Left e + + + +archiveToDocument :: Archive -> D Document +archiveToDocument zf = do + entry <- maybeToD $ findEntryByPath "word/document.xml" zf + docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = elemToNameSpaces docElem + bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem + body <- elemToBody namespaces bodyElem + return $ Document namespaces body + +elemToBody :: NameSpaces -> Element -> D Body +elemToBody ns element | isElem ns "w" "body" element = + mapD (elemToBodyPart ns) (elChildren element) >>= + (\bps -> return $ Body bps) +elemToBody _ _ = throwError WrongElem + +archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) +archiveToStyles zf = + let stylesElem = findEntryByPath "word/styles.xml" zf >>= + (parseXMLDoc . UTF8.toStringLazy . fromEntry) + in + case stylesElem of + Nothing -> (M.empty, M.empty) + Just styElem -> + let namespaces = elemToNameSpaces styElem + in + ( M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe CharStyle), + M.fromList $ buildBasedOnList namespaces styElem + (Nothing :: Maybe ParStyle) ) + +isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool +isBasedOnStyle ns element parentStyle + | isElem ns "w" "style" element + , Just styleType <- findAttrByName ns "w" "type" element + , styleType == cStyleType parentStyle + , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= + findAttrByName ns "w" "val" + , Just ps <- parentStyle = (basedOnVal == getStyleId ps) + | isElem ns "w" "style" element + , Just styleType <- findAttrByName ns "w" "type" element + , styleType == cStyleType parentStyle + , Nothing <- findChildByName ns "w" "basedOn" element + , Nothing <- parentStyle = True + | otherwise = False + +class ElemToStyle a where + cStyleType :: Maybe a -> String + elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a + getStyleId :: a -> String + +instance ElemToStyle CharStyle where + cStyleType _ = "character" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "character" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = + Just (styleId, elemToRunStyle ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +instance ElemToStyle ParStyle where + cStyleType _ = "paragraph" + elemToStyle ns element parentStyle + | isElem ns "w" "style" element + , Just "paragraph" <- findAttrByName ns "w" "type" element + , Just styleId <- findAttrByName ns "w" "styleId" element = + Just (styleId, elemToParStyleData ns element parentStyle) + | otherwise = Nothing + getStyleId s = fst s + +getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] +getStyleChildren ns element parentStyle + | isElem ns "w" "styles" element = + mapMaybe (\e -> elemToStyle ns e parentStyle) $ + filterChildren (\e' -> isBasedOnStyle ns e' parentStyle) element + | otherwise = [] + +buildBasedOnList :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] +buildBasedOnList ns element rootStyle = + case (getStyleChildren ns element rootStyle) of + [] -> [] + stys -> stys ++ + (concatMap (\s -> buildBasedOnList ns element (Just s)) stys) + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> elemToNameSpaces e + Nothing -> [] + en_namespaces = case enElem of + Just e -> elemToNameSpaces e + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + +archiveToComments :: Archive -> Comments +archiveToComments zf = + let cmtsElem = findEntryByPath "word/comments.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + cmts_namespaces = case cmtsElem of + Just e -> elemToNameSpaces e + Nothing -> [] + cmts = (elemToComments cmts_namespaces) <$> cmtsElem + in + case cmts of + Just c -> Comments cmts_namespaces c + Nothing -> Comments cmts_namespaces M.empty + +filePathToRelType :: FilePath -> Maybe DocumentLocation +filePathToRelType "word/_rels/document.xml.rels" = Just InDocument +filePathToRelType "word/_rels/footnotes.xml.rels" = Just InFootnote +filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote +filePathToRelType _ = Nothing + +relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship +relElemToRelationship relType element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship relType relId target +relElemToRelationship _ _ = Nothing + +filePathToRelationships :: Archive -> FilePath -> [Relationship] +filePathToRelationships ar fp | Just relType <- filePathToRelType fp + , Just entry <- findEntryByPath fp ar + , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = + mapMaybe (relElemToRelationship relType) $ elChildren relElems +filePathToRelationships _ _ = [] + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + concatMap (filePathToRelationships archive) $ filesInArchive archive + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "word/media/") + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do + absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs + lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs + lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls + return lvl + + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element + | isElem ns "w" "num" element = do + numId <- findAttrByName ns "w" "numId" element + absNumId <- findChildByName ns "w" "abstractNumId" element + >>= findAttrByName ns "w" "val" + return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element + | isElem ns "w" "abstractNum" element = do + absNumId <- findAttrByName ns "w" "abstractNumId" element + let levelElems = findChildrenByName ns "w" "lvl" element + levels = mapMaybe (levelElemToLevel ns) levelElems + return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element + | isElem ns "w" "lvl" element = do + ilvl <- findAttrByName ns "w" "ilvl" element + fmt <- findChildByName ns "w" "numFmt" element + >>= findAttrByName ns "w" "val" + txt <- findChildByName ns "w" "lvlText" element + >>= findAttrByName ns "w" "val" + let start = findChildByName ns "w" "start" element + >>= findAttrByName ns "w" "val" + >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering' :: Archive -> Maybe Numbering +archiveToNumbering' zf = do + case findEntryByPath "word/numbering.xml" zf of + Nothing -> Just $ Numbering [] [] [] + Just entry -> do + numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = elemToNameSpaces numberingElem + numElems = findChildrenByName namespaces "w" "num" numberingElem + absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem + nums = mapMaybe (numElemToNum namespaces) numElems + absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems + return $ Numbering namespaces nums absNums + +archiveToNumbering :: Archive -> Numbering +archiveToNumbering archive = + fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element) +elemToNotes ns notetype element + | isElem ns "w" (notetype ++ "s") element = + let pairs = mapMaybe + (\e -> findAttrByName ns "w" "id" e >>= + (\a -> Just (a, e))) + (findChildrenByName ns "w" notetype element) + in + Just $ M.fromList $ pairs +elemToNotes _ _ _ = Nothing + +elemToComments :: NameSpaces -> Element -> M.Map String Element +elemToComments ns element + | isElem ns "w" "comments" element = + let pairs = mapMaybe + (\e -> findAttrByName ns "w" "id" e >>= + (\a -> Just (a, e))) + (findChildrenByName ns "w" "comment" element) + in + M.fromList $ pairs +elemToComments _ _ = M.empty + + +--------------------------------------------- +--------------------------------------------- + +elemToTblGrid :: NameSpaces -> Element -> D TblGrid +elemToTblGrid ns element | isElem ns "w" "tblGrid" element = + let cols = findChildrenByName ns "w" "gridCol" element + in + mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) + cols +elemToTblGrid _ _ = throwError WrongElem + +elemToTblLook :: NameSpaces -> Element -> D TblLook +elemToTblLook ns element | isElem ns "w" "tblLook" element = + let firstRow = findAttrByName ns "w" "firstRow" element + val = findAttrByName ns "w" "val" element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False + in + return $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = throwError WrongElem + +elemToRow :: NameSpaces -> Element -> D Row +elemToRow ns element | isElem ns "w" "tr" element = + do + let cellElems = findChildrenByName ns "w" "tc" element + cells <- mapD (elemToCell ns) cellElems + return $ Row cells +elemToRow _ _ = throwError WrongElem + +elemToCell :: NameSpaces -> Element -> D Cell +elemToCell ns element | isElem ns "w" "tc" element = + do + cellContents <- mapD (elemToBodyPart ns) (elChildren element) + return $ Cell cellContents +elemToCell _ _ = throwError WrongElem + +elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation +elemToParIndentation ns element | isElem ns "w" "ind" element = + Just $ ParIndentation { + leftParIndent = + findAttrByName ns "w" "left" element >>= + stringToInteger + , rightParIndent = + findAttrByName ns "w" "right" element >>= + stringToInteger + , hangingParIndent = + findAttrByName ns "w" "hanging" element >>= + stringToInteger} +elemToParIndentation _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = + case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + [] -> False + ((n', _) : _) -> ((n' .|. n) /= 0) + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToBodyPart :: NameSpaces -> Element -> D BodyPart +elemToBodyPart ns element + | isElem ns "w" "p" element + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = + do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst +elemToBodyPart ns element + | isElem ns "w" "p" element + , Just (numId, lvl) <- getNumInfo ns element = do + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty + parparts <- mapD (elemToParPart ns) (elChildren element) + num <- asks envNumbering + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts +elemToBodyPart ns element + | isElem ns "w" "p" element = do + sty <- asks envParStyles + let parstyle = elemToParagraphStyle ns element sty + parparts <- mapD (elemToParPart ns) (elChildren element) + -- Word uses list enumeration for numbered headings, so we only + -- want to infer a list from the styles if it is NOT a heading. + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + num <- asks envNumbering + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> return $ Paragraph parstyle parparts +elemToBodyPart ns element + | isElem ns "w" "tbl" element = do + let caption' = findChildByName ns "w" "tblPr" element + >>= findChildByName ns "w" "tblCaption" + >>= findAttrByName ns "w" "val" + caption = (fromMaybe "" caption') + grid' = case findChildByName ns "w" "tblGrid" element of + Just g -> elemToTblGrid ns g + Nothing -> return [] + tblLook' = case findChildByName ns "w" "tblPr" element >>= + findChildByName ns "w" "tblLook" + of + Just l -> elemToTblLook ns l + Nothing -> return defaultTblLook + + grid <- grid' + tblLook <- tblLook' + rows <- mapD (elemToRow ns) (elChildren element) + return $ Tbl caption grid tblLook rows +elemToBodyPart _ _ = throwError WrongElem + +lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target +lookupRelationship docLocation relid rels = + lookup (docLocation, relid) pairs + where + pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels + +expandDrawingId :: String -> D (FilePath, B.ByteString) +expandDrawingId s = do + location <- asks envLocation + target <- asks (lookupRelationship location s . envRelationships) + case target of + Just filepath -> do + bytes <- asks (lookup ("word/" ++ filepath) . envMedia) + case bytes of + Just bs -> return (filepath, bs) + Nothing -> throwError DocxError + Nothing -> throwError DocxError + +getTitleAndAlt :: NameSpaces -> Element -> (String, String) +getTitleAndAlt ns element = + let mbDocPr = findChildByName ns "wp" "inline" element >>= + findChildByName ns "wp" "docPr" + title = case mbDocPr >>= findAttrByName ns "" "title" of + Just title' -> title' + Nothing -> "" + alt = case mbDocPr >>= findAttrByName ns "" "descr" of + Just alt' -> alt' + Nothing -> "" + in (title, alt) + +elemToParPart :: NameSpaces -> Element -> D ParPart +elemToParPart ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" + , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem + = let (title, alt) = getTitleAndAlt ns drawingElem + a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem + >>= findAttrByName ns "r" "embed" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) + Nothing -> throwError WrongElem +-- The below is an attempt to deal with images in deprecated vml format. +elemToParPart ns element + | isElem ns "w" "r" element + , Just _ <- findChildByName ns "w" "pict" element = + let drawing = findElement (elemName ns "v" "imagedata") element + >>= findAttrByName ns "r" "id" + in + case drawing of + -- Todo: check out title and attr for deprecated format. + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) + Nothing -> throwError WrongElem +-- Chart +elemToParPart ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" + , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem + = return Chart +elemToParPart ns element + | isElem ns "w" "r" element = + elemToRun ns element >>= (\r -> return $ PlainRun r) +elemToParPart ns element + | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Insertion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , Just cDate <- findAttrByName ns "w" "date" element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ Deletion cId cAuthor cDate runs +elemToParPart ns element + | isElem ns "w" "smartTag" element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ SmartTag runs +elemToParPart ns element + | isElem ns "w" "bookmarkStart" element + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = + return $ BookMark bmId bmName +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just relId <- findAttrByName ns "r" "id" element = do + location <- asks envLocation + runs <- mapD (elemToRun ns) (elChildren element) + rels <- asks envRelationships + case lookupRelationship location relId rels of + Just target -> do + case findAttrByName ns "w" "anchor" element of + Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs + Nothing -> return $ ExternalHyperLink target runs + Nothing -> return $ ExternalHyperLink "" runs +elemToParPart ns element + | isElem ns "w" "hyperlink" element + , Just anchor <- findAttrByName ns "w" "anchor" element = do + runs <- mapD (elemToRun ns) (elChildren element) + return $ InternalHyperLink anchor runs +elemToParPart ns element + | isElem ns "w" "commentRangeStart" element + , Just cmtId <- findAttrByName ns "w" "id" element = do + (Comments _ commentMap) <- asks envComments + case M.lookup cmtId commentMap of + Just cmtElem -> elemToCommentStart ns cmtElem + Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "commentRangeEnd" element + , Just cmtId <- findAttrByName ns "w" "id" element = + return $ CommentEnd cmtId +elemToParPart ns element + | isElem ns "m" "oMath" element = + (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath) +elemToParPart _ _ = throwError WrongElem + +elemToCommentStart :: NameSpaces -> Element -> D ParPart +elemToCommentStart ns element + | isElem ns "w" "comment" element + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , Just cmtDate <- findAttrByName ns "w" "date" element = do + bps <- mapD (elemToBodyPart ns) (elChildren element) + return $ CommentStart cmtId cmtAuthor cmtDate bps +elemToCommentStart _ _ = throwError WrongElem + +lookupFootnote :: String -> Notes -> Maybe Element +lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) + +lookupEndnote :: String -> Notes -> Maybe Element +lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) + +elemToExtent :: Element -> Extent +elemToExtent drawingElem = + case (getDim "cx", getDim "cy") of + (Just w, Just h) -> Just (w, h) + _ -> Nothing + where + wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" + getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem + >>= findAttr (QName at Nothing Nothing) >>= safeRead + + +childElemToRun :: NameSpaces -> Element -> D Run +childElemToRun ns element + | isElem ns "w" "drawing" element + , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" + , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) element + = let (title, alt) = getTitleAndAlt ns element + a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + in + case drawing of + Just s -> expandDrawingId s >>= + (\(fp, bs) -> return $ InlineDrawing fp title alt bs $ elemToExtent element) + Nothing -> throwError WrongElem +childElemToRun ns element + | isElem ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" + , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) element + = return InlineChart +childElemToRun ns element + | isElem ns "w" "footnoteReference" element + , Just fnId <- findAttrByName ns "w" "id" element = do + notes <- asks envNotes + case lookupFootnote fnId notes of + Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) + return $ Footnote bps + Nothing -> return $ Footnote [] +childElemToRun ns element + | isElem ns "w" "endnoteReference" element + , Just enId <- findAttrByName ns "w" "id" element = do + notes <- asks envNotes + case lookupEndnote enId notes of + Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) + return $ Endnote bps + Nothing -> return $ Endnote [] +childElemToRun _ _ = throwError WrongElem + +elemToRun :: NameSpaces -> Element -> D Run +elemToRun ns element + | isElem ns "w" "r" element + , Just altCont <- findChildByName ns "mc" "AlternateContent" element = + do let choices = findChildrenByName ns "mc" "Choice" altCont + choiceChildren = map head $ filter (not . null) $ map elChildren choices + outputs <- mapD (childElemToRun ns) choiceChildren + case outputs of + r : _ -> return r + [] -> throwError WrongElem +elemToRun ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element = + childElemToRun ns drawingElem +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChildByName ns "w" "footnoteReference" element = + childElemToRun ns ref +elemToRun ns element + | isElem ns "w" "r" element + , Just ref <- findChildByName ns "w" "endnoteReference" element = + childElemToRun ns ref +elemToRun ns element + | isElem ns "w" "r" element = do + runElems <- elemToRunElems ns element + runStyle <- elemToRunStyleD ns element + return $ Run runStyle runElems +elemToRun _ _ = throwError WrongElem + +getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue field style + | Just value <- field style = Just value + | Just parentStyle <- psStyle style + = getParentStyleValue field (snd parentStyle) +getParentStyleValue _ _ = Nothing + +getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> + Maybe a +getParStyleField field stylemap styles + | x <- mapMaybe (\x -> M.lookup x stylemap) styles + , (y:_) <- mapMaybe (getParentStyleValue field) x + = Just y +getParStyleField _ _ _ = Nothing + +elemToParagraphStyle :: NameSpaces -> Element -> ParStyleMap -> ParagraphStyle +elemToParagraphStyle ns element sty + | Just pPr <- findChildByName ns "w" "pPr" element = + let style = + mapMaybe + (findAttrByName ns "w" "val") + (findChildrenByName ns "w" "pStyle" pPr) + in ParagraphStyle + {pStyle = style + , indentation = + findChildByName ns "w" "ind" pPr >>= + elemToParIndentation ns + , dropCap = + case + findChildByName ns "w" "framePr" pPr >>= + findAttrByName ns "w" "dropCap" + of + Just "none" -> False + Just _ -> True + Nothing -> False + , pHeading = getParStyleField headingLev sty style + , pNumInfo = getParStyleField numInfo sty style + , pBlockQuote = getParStyleField isBlockQuote sty style + } +elemToParagraphStyle _ _ _ = defaultParagraphStyle + +checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool +checkOnOff ns rPr tag + | Just t <- findChild tag rPr + , Just val <- findAttrByName ns "w" "val" t = + Just $ case val of + "true" -> True + "false" -> False + "on" -> True + "off" -> False + "1" -> True + "0" -> False + _ -> False + | Just _ <- findChild tag rPr = Just True +checkOnOff _ _ _ = Nothing + +elemToRunStyleD :: NameSpaces -> Element -> D RunStyle +elemToRunStyleD ns element + | Just rPr <- findChildByName ns "w" "rPr" element = do + charStyles <- asks envCharStyles + let parentSty = case + findChildByName ns "w" "rStyle" rPr >>= + findAttrByName ns "w" "val" + of + Just styName | Just style <- M.lookup styName charStyles -> + Just (styName, style) + _ -> Nothing + return $ elemToRunStyle ns element parentSty +elemToRunStyleD _ _ = return defaultRunStyle + +elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle +elemToRunStyle ns element parentStyle + | Just rPr <- findChildByName ns "w" "rPr" element = + RunStyle + { + isBold = checkOnOff ns rPr (elemName ns "w" "b") + , isItalic = checkOnOff ns rPr (elemName ns "w" "i") + , isSmallCaps = checkOnOff ns rPr (elemName ns "w" "smallCaps") + , isStrike = checkOnOff ns rPr (elemName ns "w" "strike") + , rVertAlign = + findChildByName ns "w" "vertAlign" rPr >>= + findAttrByName ns "w" "val" >>= + \v -> Just $ case v of + "superscript" -> SupScrpt + "subscript" -> SubScrpt + _ -> BaseLn + , rUnderline = + findChildByName ns "w" "u" rPr >>= + findAttrByName ns "w" "val" + , rStyle = parentStyle + } +elemToRunStyle _ _ _ = defaultRunStyle + +isNumericNotNull :: String -> Bool +isNumericNotNull str = (str /= []) && (all isDigit str) + +getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel ns element + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just index <- stripPrefix "Heading" styleId + , isNumericNotNull index = Just (styleId, read index) + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just index <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" >>= + stripPrefix "heading " + , isNumericNotNull index = Just (styleId, read index) +getHeaderLevel _ _ = Nothing + +blockQuoteStyleIds :: [String] +blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] + +blockQuoteStyleNames :: [String] +blockQuoteStyleNames = ["Quote", "Block Text"] + +getBlockQuote :: NameSpaces -> Element -> Maybe Bool +getBlockQuote ns element + | Just styleId <- findAttrByName ns "w" "styleId" element + , styleId `elem` blockQuoteStyleIds = Just True + | Just styleName <- findChildByName ns "w" "name" element >>= + findAttrByName ns "w" "val" + , styleName `elem` blockQuoteStyleNames = Just True +getBlockQuote _ _ = Nothing + +getNumInfo :: NameSpaces -> Element -> Maybe (String, String) +getNumInfo ns element = do + let numPr = findChildByName ns "w" "pPr" element >>= + findChildByName ns "w" "numPr" + lvl = fromMaybe "0" (numPr >>= + findChildByName ns "w" "ilvl" >>= + findAttrByName ns "w" "val") + numId <- numPr >>= + findChildByName ns "w" "numId" >>= + findAttrByName ns "w" "val" + return (numId, lvl) + + +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData +elemToParStyleData ns element parentStyle = + ParStyleData + { + headingLev = getHeaderLevel ns element + , isBlockQuote = getBlockQuote ns element + , numInfo = getNumInfo ns element + , psStyle = parentStyle + } + +elemToRunElem :: NameSpaces -> Element -> D RunElem +elemToRunElem ns element + | isElem ns "w" "t" element + || isElem ns "w" "delText" element + || isElem ns "m" "t" element = do + let str = strContent element + font <- asks envFont + case font of + Nothing -> return $ TextRun str + Just f -> return . TextRun $ + map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str + | isElem ns "w" "br" element = return LnBrk + | isElem ns "w" "tab" element = return Tab + | isElem ns "w" "softHyphen" element = return SoftHyphen + | isElem ns "w" "noBreakHyphen" element = return NoBreakHyphen + | isElem ns "w" "sym" element = return (getSymChar ns element) + | otherwise = throwError WrongElem + where + lowerFromPrivate (ord -> c) + | c >= ord '\xF000' = chr $ c - ord '\xF000' + | otherwise = chr c + +-- The char attribute is a hex string +getSymChar :: NameSpaces -> Element -> RunElem +getSymChar ns element + | Just s <- lowerFromPrivate <$> getCodepoint + , Just font <- getFont = + let [(char, _)] = readLitChar ("\\x" ++ s) in + TextRun . maybe "" (:[]) $ getUnicode font char + where + getCodepoint = findAttrByName ns "w" "char" element + getFont = stringToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate ('F':xs) = '0':xs + lowerFromPrivate xs = xs +getSymChar _ _ = TextRun "" + +elemToRunElems :: NameSpaces -> Element -> D [RunElem] +elemToRunElems ns element + | isElem ns "w" "r" element + || isElem ns "m" "r" element = do + let qualName = elemName ns "w" + let font = do + fontElem <- findElement (qualName "rFonts") element + stringToFont =<< + (foldr (<|>) Nothing $ + map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"]) + local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) +elemToRunElems _ _ = throwError WrongElem + +setFont :: Maybe Font -> ReaderEnv -> ReaderEnv +setFont f s = s{envFont = f} + diff --git a/src/Text/Pandoc/Readers/Docx/StyleMap.hs b/src/Text/Pandoc/Readers/Docx/StyleMap.hs new file mode 100644 index 000000000..00906cf07 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/StyleMap.hs @@ -0,0 +1,108 @@ +module Text.Pandoc.Readers.Docx.StyleMap ( StyleMaps(..) + , alterMap + , getMap + , defaultStyleMaps + , getStyleMaps + , getStyleId + , hasStyleName + ) where + +import Text.XML.Light +import Text.Pandoc.Readers.Docx.Util +import Control.Monad.State +import Data.Char (toLower) +import qualified Data.Map as M + +newtype ParaStyleMap = ParaStyleMap ( M.Map String String ) +newtype CharStyleMap = CharStyleMap ( M.Map String String ) + +class StyleMap a where + alterMap :: (M.Map String String -> M.Map String String) -> a -> a + getMap :: a -> M.Map String String + +instance StyleMap ParaStyleMap where + alterMap f (ParaStyleMap m) = ParaStyleMap $ f m + getMap (ParaStyleMap m) = m + +instance StyleMap CharStyleMap where + alterMap f (CharStyleMap m) = CharStyleMap $ f m + getMap (CharStyleMap m) = m + +insert :: (StyleMap a) => Maybe String -> Maybe String -> a -> a +insert (Just k) (Just v) m = alterMap (M.insert k v) m +insert _ _ m = m + +getStyleId :: (StyleMap a) => String -> a -> String +getStyleId s = M.findWithDefault (filter (/=' ') s) (map toLower s) . getMap + +hasStyleName :: (StyleMap a) => String -> a -> Bool +hasStyleName styleName = M.member (map toLower styleName) . getMap + +data StyleMaps = StyleMaps { sNameSpaces :: NameSpaces + , sParaStyleMap :: ParaStyleMap + , sCharStyleMap :: CharStyleMap + } + +data StyleType = ParaStyle | CharStyle + +defaultStyleMaps :: StyleMaps +defaultStyleMaps = StyleMaps { sNameSpaces = [] + , sParaStyleMap = ParaStyleMap M.empty + , sCharStyleMap = CharStyleMap M.empty + } + +type StateM a = State StyleMaps a + +getStyleMaps :: Element -> StyleMaps +getStyleMaps docElem = execState genStyleMap state' + where + state' = defaultStyleMaps {sNameSpaces = elemToNameSpaces docElem} + genStyleItem e = do + styleType <- getStyleType e + styleId <- getAttrStyleId e + nameValLowercase <- fmap (map toLower) `fmap` getNameVal e + case styleType of + Just ParaStyle -> modParaStyleMap $ insert nameValLowercase styleId + Just CharStyle -> modCharStyleMap $ insert nameValLowercase styleId + _ -> return () + genStyleMap = do + style <- elemName' "style" + let styles = findChildren style docElem + forM_ styles genStyleItem + +modParaStyleMap :: (ParaStyleMap -> ParaStyleMap) -> StateM () +modParaStyleMap f = modify $ \s -> + s {sParaStyleMap = f $ sParaStyleMap s} + +modCharStyleMap :: (CharStyleMap -> CharStyleMap) -> StateM () +modCharStyleMap f = modify $ \s -> + s {sCharStyleMap = f $ sCharStyleMap s} + +getStyleType :: Element -> StateM (Maybe StyleType) +getStyleType e = do + styleTypeStr <- getAttrType e + case styleTypeStr of + Just "paragraph" -> return $ Just ParaStyle + Just "character" -> return $ Just CharStyle + _ -> return Nothing + +getAttrType :: Element -> StateM (Maybe String) +getAttrType el = do + name <- elemName' "type" + return $ findAttr name el + +getAttrStyleId :: Element -> StateM (Maybe String) +getAttrStyleId el = do + name <- elemName' "styleId" + return $ findAttr name el + +getNameVal :: Element -> StateM (Maybe String) +getNameVal el = do + name <- elemName' "name" + val <- elemName' "val" + return $ findChild name el >>= findAttr val + +elemName' :: String -> StateM QName +elemName' name = do + namespaces <- gets sNameSpaces + return $ elemName namespaces "w" name diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs new file mode 100644 index 000000000..6646e5b7f --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -0,0 +1,47 @@ +module Text.Pandoc.Readers.Docx.Util ( + NameSpaces + , elemName + , isElem + , elemToNameSpaces + , findChildByName + , findChildrenByName + , findAttrByName + ) where + +import Text.XML.Light +import Data.Maybe (mapMaybe) + +type NameSpaces = [(String, String)] + +elemToNameSpaces :: Element -> NameSpaces +elemToNameSpaces = mapMaybe attrToNSPair . elAttribs + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + +elemName :: NameSpaces -> String -> String -> QName +elemName ns prefix name = + QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + +isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem ns prefix name element = + let ns' = ns ++ elemToNameSpaces element + in qName (elName element) == name && + qURI (elName element) == lookup prefix ns' + +findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findChild (elemName ns' pref name) el + +findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findChildren (elemName ns' pref name) el + +findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName ns pref name el = + let ns' = ns ++ elemToNameSpaces el + in findAttr (elemName ns' pref name) el + diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..2eaa842b6 --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,279 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , TupleSections + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Walk (walk, query) +import Text.Pandoc.Options ( ReaderOptions(..)) +import Text.Pandoc.Extensions (enableExtension, Extension(Ext_raw_html)) +import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField) +import Network.URI (unEscapeString) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) +import Control.Monad.Except (throwError) +import Text.Pandoc.MIME (MimeType) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath ( takeFileName, (</>), dropFileName, normalise + , dropFileName + , splitFileName ) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Monad (guard, liftM) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList, elems) +import Data.Monoid ((<>)) +import Control.DeepSeq (deepseq, NFData) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P + +type Items = M.Map String (FilePath, MimeType) + +readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc +readEPUB opts bytes = case toArchiveOrFail bytes of + Right archive -> archiveToEPUB opts $ archive + Left _ -> throwError $ PandocParseError "Couldn't extract ePub file" + +-- runEPUB :: Except PandocError a -> Either PandocError a +-- runEPUB = runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc +archiveToEPUB os archive = do + -- root is path to folder with manifest file in + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + -- No need to collapse here as the image path is from the manifest file + let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + let ast = coverDoc <> (Pandoc meta bs) + P.setMediaBag $ fetchImages (M.elems items) root archive ast + return ast + where + os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)} + parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc + parseSpineElem (normalise -> r) (normalise -> path, mime) = do + doc <- mimeToReader mime r path + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> doc + mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" (unEscapeString -> root) + (unEscapeString -> path) = do + fname <- findEntryByPathE (root </> path) archive + html <- readHtml os' . UTF8.toStringLazy $ fromEntry fname + return $ fixInternalReferences path html + mimeToReader s _ (unEscapeString -> path) + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + +-- paths should be absolute when this function is called +-- renameImages should do this +fetchImages :: [(FilePath, MimeType)] + -> FilePath -- ^ Root + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root arc (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry link = + let abslink = normalise (root </> link) in + (link , lookup link mimes, ) . fromEntry + <$> findEntryByPath abslink arc + +iq :: Inline -> [FilePath] +iq (Image _ _ (url, _)) = [url] +iq _ = [] + +-- Remove relative paths +renameImages :: FilePath -> Inline -> Inline +renameImages root img@(Image attr a (url, b)) + | "data:" `isPrefixOf` url = img + | otherwise = Image attr a (collapseFilePath (root </> url), b) +renameImages _ x = x + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [MimeType] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: PandocMonad m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + addMetaField (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: PandocMonad m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + manifestFile <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName manifestFile + --mime <- lookup "media-type" as + manifest <- findEntryByPathE manifestFile archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: FilePath -> Pandoc -> Pandoc +fixInternalReferences pathToFile = + (walk $ renameImages root) + . (walk $ fixBlockIRs filename) + . (walk $ fixInlineIRs filename) + where + (root, escapeURI -> filename) = splitFileName pathToFile + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link as is ('#':url, tit)) = + Link (fixAttrs s as) is (addHash s url, tit) +fixInlineIRs s (Link as is t) = + Link (fixAttrs s as) is t +fixInlineIRs _ v = v + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link attr is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link attr is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = takeFileName s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Library + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +-- Utility + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: PandocMonad m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry +findEntryByPathE (normalise -> path) a = + mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: PandocMonad m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: PandocMonad m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: PandocMonad m => String -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..f02f1a1d4 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,1136 @@ +{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, +ViewPatterns#-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.HTML + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of HTML to 'Pandoc' document. +-} +module Text.Pandoc.Readers.HTML ( readHtml + , htmlTag + , htmlInBalanced + , isInlineTag + , isBlockTag + , isTextTag + , isCommentTag + ) where + +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Match +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled, + Extension (Ext_epub_html_exts, + Ext_raw_html, Ext_native_divs, Ext_native_spans)) +import Text.Pandoc.Logging +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import qualified Data.Map as M +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf, isPrefixOf ) +import Data.Char ( isDigit ) +import Control.Monad ( guard, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<|>) ) +import Data.Monoid (First (..)) +import Text.TeXMath (readMathML, writeTeX) +import Data.Default (Default (..), def) +import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift) +import Network.URI (URI, parseURIReference, nonStrictRelativeTo) +import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) +import Data.Monoid ((<>)) +import Text.Parsec.Error +import qualified Data.Set as Set +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import Control.Monad.Except (throwError) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assumes @'\n'@ line endings) + -> m Pandoc +readHtml opts inp = do + let tags = stripPrefixes . canonicalizeTags $ + parseTagsOptions parseOptions{ optTagPosition = True } inp + parseDoc = do + blocks <- (fixPlains False) . mconcat <$> manyTill block eof + meta <- stateMeta . parserState <$> getState + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + getError (errorMessages -> ms) = case ms of + [] -> "" + (m:_) -> messageString m + result <- flip runReaderT def $ + runParserT parseDoc + (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) + "source" tags + case result of + Right doc -> return doc + Left err -> throwError $ PandocParseError $ getError err + +replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x + +data HTMLState = + HTMLState + { parserState :: ParserState, + noteTable :: [(String, Blocks)], + baseHref :: Maybe URI, + identifiers :: Set.Set String, + headerMap :: M.Map Inlines String + } + +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain + } + +setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a +setInChapter = local (\s -> s {inChapter = True}) + +setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a +setInPlain = local (\s -> s {inPlain = True}) + +type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) + +type TagParser m = HTMLParser m [Tag String] + +pBody :: PandocMonad m => TagParser m Blocks +pBody = pInTags "body" block + +pHead :: PandocMonad m => TagParser m Blocks +pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) + where pTitle = pInTags "title" inline >>= setTitle . trimInlines + setTitle t = mempty <$ (updateState $ B.setMeta "title" t) + pMetaTag = do + mt <- pSatisfy (~== TagOpen "meta" []) + let name = fromAttrib "name" mt + if null name + then return mempty + else do + let content = fromAttrib "content" mt + updateState $ \s -> + let ps = parserState s in + s{ parserState = ps{ + stateMeta = addMetaField name (B.text content) + (stateMeta ps) } } + return mempty + pBaseTag = do + bt <- pSatisfy (~== TagOpen "base" []) + updateState $ \st -> st{ baseHref = + parseURIReference $ fromAttrib "href" bt } + return mempty + +block :: PandocMonad m => TagParser m Blocks +block = do + pos <- getPosition + res <- choice + [ eSection + , eSwitch B.para block + , mempty <$ eFootnote + , mempty <$ eTOC + , mempty <$ eTitlePage + , pPara + , pHeader + , pBlockQuote + , pCodeBlock + , pList + , pHrule + , pTable + , pHead + , pBody + , pDiv + , pPlain + , pRawHtmlBlock + ] + report $ ParsingTrace (take 60 $ show $ B.toList res) pos + return res + +namespaces :: PandocMonad m => [(String, TagParser m Inlines)] +namespaces = [(mathMLNamespace, pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: (PandocMonad m, Monoid a) + => (Inlines -> a) + -> TagParser m a + -> TagParser m a +eSwitch constructor parser = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return $ maybe fallback constructor cases + +eCase :: PandocMonad m => TagParser m (Maybe Inlines) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: PandocMonad m => TagParser m () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: PandocMonad m => String -> Blocks -> TagParser m () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: PandocMonad m => TagParser m Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: PandocMonad m => TagParser m () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) + +pList :: PandocMonad m => TagParser m Blocks +pList = pBulletList <|> pOrderedList <|> pDefinitionList + +pBulletList :: PandocMonad m => TagParser m Blocks +pBulletList = try $ do + pSatisfy (~== TagOpen "ul" []) + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ul")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... + skipMany nonItem + items <- manyTill (pListItem nonItem) (pCloses "ul") + return $ B.bulletList $ map (fixPlains True) items + +pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks +pListItem nonItem = do + TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) + (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + +parseListStyleType :: String -> ListNumberStyle +parseListStyleType "lower-roman" = LowerRoman +parseListStyleType "upper-roman" = UpperRoman +parseListStyleType "lower-alpha" = LowerAlpha +parseListStyleType "upper-alpha" = UpperAlpha +parseListStyleType "decimal" = Decimal +parseListStyleType _ = DefaultStyle + +parseTypeAttr :: String -> ListNumberStyle +parseTypeAttr "i" = LowerRoman +parseTypeAttr "I" = UpperRoman +parseTypeAttr "a" = LowerAlpha +parseTypeAttr "A" = UpperAlpha +parseTypeAttr "1" = Decimal +parseTypeAttr _ = DefaultStyle + +pOrderedList :: PandocMonad m => TagParser m Blocks +pOrderedList = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + let (start, style) = (sta', sty') + where sta = fromMaybe "1" $ + lookup "start" attribs + sta' = if all isDigit sta + then read sta + else 1 + + pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"] + + typeAttr = fromMaybe "" $ lookup "type" attribs + classAttr = fromMaybe "" $ lookup "class" attribs + styleAttr = fromMaybe "" $ lookup "style" attribs + listStyle = fromMaybe "" $ pickListStyle styleAttr + + sty' = foldOrElse DefaultStyle + [ parseTypeAttr typeAttr + , parseListStyleType classAttr + , parseListStyleType listStyle + ] + let nonItem = pSatisfy (\t -> + not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && + not (t ~== TagClose "ol")) + -- note: if they have an <ol> or <ul> not in scope of a <li>, + -- treat it as a list item, though it's not valid xhtml... + skipMany nonItem + items <- manyTill (pListItem nonItem) (pCloses "ol") + return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items + +pDefinitionList :: PandocMonad m => TagParser m Blocks +pDefinitionList = try $ do + pSatisfy (~== TagOpen "dl" []) + items <- manyTill pDefListItem (pCloses "dl") + return $ B.definitionList items + +pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) +pDefListItem = try $ do + let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && + not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) + defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) + skipMany nonItem + let term = foldl1 (\x y -> x <> B.linebreak <> y) terms + return (term, map (fixPlains True) defs) + +fixPlains :: Bool -> Blocks -> Blocks +fixPlains inList bs = if any isParaish bs' + then B.fromList $ map plainToPara bs' + else bs + where isParaish (Para _) = True + isParaish (CodeBlock _ _) = True + isParaish (Header _ _ _) = True + isParaish (BlockQuote _) = True + isParaish (BulletList _) = not inList + isParaish (OrderedList _ _) = not inList + isParaish (DefinitionList _) = not inList + isParaish _ = False + plainToPara (Plain xs) = Para xs + plainToPara x = x + bs' = B.toList bs + +pRawTag :: PandocMonad m => TagParser m String +pRawTag = do + tag <- pAnyTag + let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] + if tagOpen ignorable (const True) tag || tagClose ignorable tag + then return [] + else return $ renderTags' [tag] + +pDiv :: PandocMonad m => TagParser m Blocks +pDiv = try $ do + guardEnabled Ext_native_divs + let isDivLike "div" = True + isDivLike "section" = True + isDivLike _ = False + TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + contents <- pInTags tag block + let (ident, classes, kvs) = mkAttr attr + let classes' = if tag == "section" + then "section":classes + else classes + return $ B.divWith (ident, classes', kvs) contents + +pRawHtmlBlock :: PandocMonad m => TagParser m Blocks +pRawHtmlBlock = do + raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag + exts <- getOption readerExtensions + if extensionEnabled Ext_raw_html exts && not (null raw) + then return $ B.rawBlock "html" raw + else ignore raw + +ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a +ignore raw = do + pos <- getPosition + -- raw can be null for tags like <!DOCTYPE>; see paRawTag + -- in this case we don't want a warning: + unless (null raw) $ + report $ SkippedContent raw pos + return mempty + +pHtmlBlock :: PandocMonad m => String -> TagParser m String +pHtmlBlock t = try $ do + open <- pSatisfy (~== TagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) + return $ renderTags' $ [open] ++ contents ++ [TagClose t] + +-- Sets chapter context +eSection :: PandocMonad m => TagParser m Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: PandocMonad m => String -> TagParser m Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + +eTitlePage :: PandocMonad m => TagParser m () +eTitlePage = try $ do + let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) + let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") + isTitlePage + TagOpen tag _ <- lookAhead $ pSatisfy groupTag + () <$ pInTags tag block + +pHeader :: PandocMonad m => TagParser m Blocks +pHeader = try $ do + TagOpen tagtype attr <- pSatisfy $ + tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) + (const True) + let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + level <- headerLevel tagtype + contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) + let ident = fromMaybe "" $ lookup "id" attr + let classes = maybe [] words $ lookup "class" attr + let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] + attr' <- registerHeader (ident, classes, keyvals) contents + return $ if bodyTitle + then mempty -- skip a representation of the title in the body + else B.headerWith attr' level contents + +pHrule :: PandocMonad m => TagParser m Blocks +pHrule = do + pSelfClosing (=="hr") (const True) + return B.horizontalRule + +pTable :: PandocMonad m => TagParser m Blocks +pTable = try $ do + TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank + widths' <- (mconcat <$> many1 pColgroup) <|> many pCol + let pTh = option [] $ pInTags "tr" (pCell "th") + pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th") + pTBody = do pOptInTag "tbody" $ many1 pTr + head'' <- pOptInTag "thead" pTh + head' <- pOptInTag "tbody" $ do + if null head'' + then pTh + else return head'' + rowsLs <- many pTBody + rows' <- pOptInTag "tfoot" $ many pTr + TagClose _ <- pSatisfy (~== TagClose "table") + let rows'' = (concat rowsLs) ++ rows' + -- fail on empty table + guard $ not $ null head' && null rows'' + let isSinglePlain x = case B.toList x of + [] -> True + [Plain _] -> True + _ -> False + let isSimple = all isSinglePlain $ concat (head':rows'') + let cols = length $ if null head' then head rows'' else head' + -- add empty cells to short rows + let addEmpties r = case cols - length r of + n | n > 0 -> r ++ replicate n mempty + | otherwise -> r + let rows = map addEmpties rows'' + let aligns = replicate cols AlignDefault + let widths = if null widths' + then if isSimple + then replicate cols 0 + else replicate cols (1.0 / fromIntegral cols) + else widths' + return $ B.table caption (zip aligns widths) head' rows + +pCol :: PandocMonad m => TagParser m Double +pCol = try $ do + TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + skipMany pBlank + optional $ pSatisfy (~== TagClose "col") + skipMany pBlank + return $ case lookup "width" attribs of + Nothing -> case lookup "style" attribs of + Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> + fromMaybe 0.0 $ safeRead ('0':'.':filter + (`notElem` " \t\r\n%'\";") xs) + _ -> 0.0 + Just x | not (null x) && last x == '%' -> + fromMaybe 0.0 $ safeRead ('0':'.':init x) + _ -> 0.0 + +pColgroup :: PandocMonad m => TagParser m [Double] +pColgroup = try $ do + pSatisfy (~== TagOpen "colgroup" []) + skipMany pBlank + manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank + +noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" + where isNullOrOne x = case fromAttrib x t of + "" -> True + "1" -> True + _ -> False + +pCell :: PandocMonad m => String -> TagParser m [Blocks] +pCell celltype = try $ do + skipMany pBlank + res <- pInTags' celltype noColOrRowSpans block + skipMany pBlank + return [res] + +pBlockQuote :: PandocMonad m => TagParser m Blocks +pBlockQuote = do + contents <- pInTags "blockquote" block + return $ B.blockQuote $ fixPlains False contents + +pPlain :: PandocMonad m => TagParser m Blocks +pPlain = do + contents <- setInPlain $ trimInlines . mconcat <$> many1 inline + if B.isNull contents + then return mempty + else return $ B.plain contents + +pPara :: PandocMonad m => TagParser m Blocks +pPara = do + contents <- trimInlines <$> pInTags "p" inline + return $ B.para contents + +pCodeBlock :: PandocMonad m => TagParser m Blocks +pCodeBlock = try $ do + TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + contents <- manyTill pAnyTag (pCloses "pre" <|> eof) + let rawText = concatMap tagToString contents + -- drop leading newline if any + let result' = case rawText of + '\n':xs -> xs + _ -> rawText + -- drop trailing newline if any + let result = case reverse result' of + '\n':_ -> init result' + _ -> result' + return $ B.codeBlockWith (mkAttr attr) result + +tagToString :: Tag String -> String +tagToString (TagText s) = s +tagToString (TagOpen "br" _) = "\n" +tagToString _ = "" + +inline :: PandocMonad m => TagParser m Inlines +inline = choice + [ eNoteref + , eSwitch id inline + , pTagText + , pQ + , pEmph + , pStrong + , pSuperscript + , pSubscript + , pStrikeout + , pLineBreak + , pLink + , pImage + , pCode + , pSpan + , pMath False + , pRawHtmlInline + ] + +pLocation :: PandocMonad m => TagParser m () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSat f = do + pos <- getPosition + token show (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSatisfy f = try $ optional pLocation >> pSat f + +pAnyTag :: PandocMonad m => TagParser m (Tag String) +pAnyTag = pSatisfy (const True) + +pSelfClosing :: PandocMonad m + => (String -> Bool) -> ([Attribute String] -> Bool) + -> TagParser m (Tag String) +pSelfClosing f g = do + open <- pSatisfy (tagOpen f g) + optional $ pSatisfy (tagClose f) + return open + +pQ :: PandocMonad m => TagParser m Inlines +pQ = do + context <- asks quoteContext + let quoteType = case context of + InDoubleQuote -> SingleQuote + _ -> DoubleQuote + let innerQuoteContext = if quoteType == SingleQuote + then InSingleQuote + else InDoubleQuote + let constructor = case quoteType of + SingleQuote -> B.singleQuoted + DoubleQuote -> B.doubleQuoted + withQuoteContext innerQuoteContext $ + pInlinesInTags "q" constructor + +pEmph :: PandocMonad m => TagParser m Inlines +pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph + +pStrong :: PandocMonad m => TagParser m Inlines +pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong + +pSuperscript :: PandocMonad m => TagParser m Inlines +pSuperscript = pInlinesInTags "sup" B.superscript + +pSubscript :: PandocMonad m => TagParser m Inlines +pSubscript = pInlinesInTags "sub" B.subscript + +pStrikeout :: PandocMonad m => TagParser m Inlines +pStrikeout = do + pInlinesInTags "s" B.strikeout <|> + pInlinesInTags "strike" B.strikeout <|> + pInlinesInTags "del" B.strikeout <|> + try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + contents <- mconcat <$> manyTill inline (pCloses "span") + return $ B.strikeout contents) + +pLineBreak :: PandocMonad m => TagParser m Inlines +pLineBreak = do + pSelfClosing (=="br") (const True) + return B.linebreak + +-- Unlike fromAttrib from tagsoup, this distinguishes +-- between a missing attribute and an attribute with empty content. +maybeFromAttrib :: String -> Tag String -> Maybe String +maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib _ _ = Nothing + +pLink :: PandocMonad m => TagParser m Inlines +pLink = try $ do + tag <- pSatisfy $ tagOpenLit "a" (const True) + let title = fromAttrib "title" tag + -- take id from id attribute if present, otherwise name + let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag + let cls = words $ fromAttrib "class" tag + lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") + -- check for href; if href, then a link, otherwise a span + case maybeFromAttrib "href" tag of + Nothing -> + return $ B.spanWith (uid, cls, []) lab + Just url' -> do + mbBaseHref <- baseHref <$> getState + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> + show (rel `nonStrictRelativeTo` bs) + _ -> url' + return $ B.linkWith (uid, cls, []) (escapeURI url) title lab + +pImage :: PandocMonad m => TagParser m Inlines +pImage = do + tag <- pSelfClosing (=="img") (isJust . lookup "src") + mbBaseHref <- baseHref <$> getState + let url' = fromAttrib "src" tag + let url = case (parseURIReference url', mbBaseHref) of + (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) + _ -> url' + let title = fromAttrib "title" tag + let alt = fromAttrib "alt" tag + let uid = fromAttrib "id" tag + let cls = words $ fromAttrib "class" tag + let getAtt k = case fromAttrib k tag of + "" -> [] + v -> [(k, v)] + let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"] + return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) + +pCode :: PandocMonad m => TagParser m Inlines +pCode = try $ do + (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + result <- manyTill pAnyTag (pCloses open) + return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result + +pSpan :: PandocMonad m => TagParser m Inlines +pSpan = try $ do + guardEnabled Ext_native_spans + TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + contents <- pInTags "span" inline + let isSmallCaps = fontVariant == "small-caps" + where styleAttr = fromMaybe "" $ lookup "style" attr + fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr + let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) + return $ tag contents + +pRawHtmlInline :: PandocMonad m => TagParser m Inlines +pRawHtmlInline = do + inplain <- asks inPlain + result <- pSatisfy (tagComment (const True)) + <|> if inplain + then pSatisfy (not . isBlockTag) + else pSatisfy isInlineTag + exts <- getOption readerExtensions + let raw = renderTags' [result] + if extensionEnabled Ext_raw_html exts + then return $ B.rawInline "html" raw + else ignore raw + +mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath s = writeTeX <$> readMathML s + +pMath :: PandocMonad m => Bool -> TagParser m Inlines +pMath inCase = try $ do + open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + -- we'll assume math tags are MathML unless specially marked + -- otherwise... + unless inCase $ + guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) + case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of + Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ + innerText contents + Right [] -> return mempty + Right x -> return $ case lookup "display" attr of + Just "block" -> B.displayMath x + _ -> B.math x + +pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines) + -> TagParser m Inlines +pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline + +pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a +pInTags tagtype parser = pInTags' tagtype (const True) parser + +pInTags' :: (PandocMonad m, Monoid a) + => String + -> (Tag String -> Bool) + -> TagParser m a + -> TagParser m a +pInTags' tagtype tagtest parser = try $ do + pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) + +-- parses p, preceeded by an optional opening tag +-- and followed by an optional closing tags +pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a +pOptInTag tagtype p = try $ do + skipMany pBlank + optional $ pSatisfy (~== TagOpen tagtype []) + skipMany pBlank + x <- p + skipMany pBlank + optional $ pSatisfy (~== TagClose tagtype) + skipMany pBlank + return x + +pCloses :: PandocMonad m => String -> TagParser m () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> pAnyTag >> return () + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "dd" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "tr" -> return () + _ -> mzero + +pTagText :: PandocMonad m => TagParser m Inlines +pTagText = try $ do + (TagText str) <- pSatisfy isTagText + st <- getState + qu <- ask + parsed <- lift $ lift $ + flip runReaderT qu $ runParserT (many pTagContents) st "text" str + case parsed of + Left _ -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'" + Right result -> return $ mconcat result + +pBlank :: PandocMonad m => TagParser m () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ all isSpace str + +type InlinesParser m = HTMLParser m String + +pTagContents :: PandocMonad m => InlinesParser m Inlines +pTagContents = + B.displayMath <$> mathDisplay + <|> B.math <$> mathInline + <|> pStr + <|> pSpace + <|> smartPunctuation pTagContents + <|> pSymbol + <|> pBad + +pStr :: PandocMonad m => InlinesParser m Inlines +pStr = do + result <- many1 $ satisfy $ \c -> + not (isSpace c) && not (isSpecial c) && not (isBad c) + updateLastStrPos + return $ B.str result + +isSpecial :: Char -> Bool +isSpecial '"' = True +isSpecial '\'' = True +isSpecial '.' = True +isSpecial '-' = True +isSpecial '$' = True +isSpecial '\8216' = True +isSpecial '\8217' = True +isSpecial '\8220' = True +isSpecial '\8221' = True +isSpecial _ = False + +pSymbol :: PandocMonad m => InlinesParser m Inlines +pSymbol = satisfy isSpecial >>= return . B.str . (:[]) + +isBad :: Char -> Bool +isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML + +pBad :: PandocMonad m => InlinesParser m Inlines +pBad = do + c <- satisfy isBad + let c' = case c of + '\128' -> '\8364' + '\130' -> '\8218' + '\131' -> '\402' + '\132' -> '\8222' + '\133' -> '\8230' + '\134' -> '\8224' + '\135' -> '\8225' + '\136' -> '\710' + '\137' -> '\8240' + '\138' -> '\352' + '\139' -> '\8249' + '\140' -> '\338' + '\142' -> '\381' + '\145' -> '\8216' + '\146' -> '\8217' + '\147' -> '\8220' + '\148' -> '\8221' + '\149' -> '\8226' + '\150' -> '\8211' + '\151' -> '\8212' + '\152' -> '\732' + '\153' -> '\8482' + '\154' -> '\353' + '\155' -> '\8250' + '\156' -> '\339' + '\158' -> '\382' + '\159' -> '\376' + _ -> '?' + return $ B.str [c'] + +pSpace :: PandocMonad m => InlinesParser m Inlines +pSpace = many1 (satisfy isSpace) >>= \xs -> + if '\n' `elem` xs + then return B.softbreak + else return B.space + +-- +-- Constants +-- + +eitherBlockOrInline :: [String] +eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", + "del", "ins", + "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] + +{- +inlineHtmlTags :: [[Char]] +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", + "br", "cite", "code", "dfn", "em", "font", "i", "img", + "input", "kbd", "label", "q", "s", "samp", "select", + "small", "span", "strike", "strong", "sub", "sup", + "textarea", "tt", "u", "var"] +-} + +blockHtmlTags :: [String] +blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "button", "canvas", + "caption", "center", "col", "colgroup", "dd", "dir", "div", + "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "menu", "noframes", "ol", "output", "p", "pre", + "section", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script", "style"] + +-- We want to allow raw docbook in markdown documents, so we +-- include docbook block tags here too. +blockDocBookTags :: [String] +blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", + "orderedlist", "segmentedlist", "simplelist", + "variablelist", "caution", "important", "note", "tip", + "warning", "address", "literallayout", "programlisting", + "programlistingco", "screen", "screenco", "screenshot", + "synopsis", "example", "informalexample", "figure", + "informalfigure", "table", "informaltable", "para", + "simpara", "formalpara", "equation", "informalequation", + "figure", "screenshot", "mediaobject", "qandaset", + "procedure", "task", "cmdsynopsis", "funcsynopsis", + "classsynopsis", "blockquote", "epigraph", "msgset", + "sidebar", "title"] + +epubTags :: [String] +epubTags = ["case", "switch", "default"] + +blockTags :: [String] +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags + +isInlineTag :: Tag String -> Bool +isInlineTag t = tagOpen isInlineTagName (const True) t || + tagClose isInlineTagName t || + tagComment (const True) t + where isInlineTagName x = x `notElem` blockTags + +isBlockTag :: Tag String -> Bool +isBlockTag t = tagOpen isBlockTagName (const True) t || + tagClose isBlockTagName t || + tagComment (const True) t + where isBlockTagName ('?':_) = True + isBlockTagName ('!':_) = True + isBlockTagName x = x `elem` blockTags + || x `elem` eitherBlockOrInline + +isTextTag :: Tag String -> Bool +isTextTag = tagText (const True) + +isCommentTag :: Tag String -> Bool +isCommentTag = tagComment (const True) + +-- taken from HXT and extended +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags +closes :: String -> String -> Bool +_ `closes` "body" = False +_ `closes` "html" = False +"body" `closes` "head" = True +"a" `closes` "a" = True +"li" `closes` "li" = True +"th" `closes` t | t `elem` ["th","td"] = True +"tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True +"dt" `closes` t | t `elem` ["dt","dd"] = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- http://www.w3.org/TR/html-markup/p.html +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True +"meta" `closes` "meta" = True +"form" `closes` "form" = True +"label" `closes` "label" = True +"map" `closes` "map" = True +"object" `closes` "object" = True +_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True +t `closes` "select" | t /= "option" = True +"thead" `closes` t | t `elem` ["colgroup"] = True +"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True +"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True +t `closes` t2 | + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" +t1 `closes` t2 | + t1 `elem` blockTags && + t2 `notElem` (blockTags ++ eitherBlockOrInline) = True +_ `closes` _ = False + +--- parsers for use in markdown, textile readers + +-- | Matches a stretch of HTML in balanced tags. +htmlInBalanced :: (Monad m) + => (Tag String -> Bool) + -> ParserT String st m String +htmlInBalanced f = try $ do + lookAhead (char '<') + inp <- getInput + let ts = canonicalizeTags $ + parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } inp + case ts of + (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do + guard $ f t + guard $ not $ hasTagWarning (t : take 1 rest) + case htmlInBalanced' tn (t:rest) of + [] -> mzero + xs -> case reverse xs of + (TagClose _ : TagPosition er ec : _) -> do + let ls = er - sr + let cs = ec - sc + lscontents <- unlines <$> count ls anyLine + cscontents <- count cs anyChar + (_,closetag) <- htmlTag (~== TagClose tn) + return (lscontents ++ cscontents ++ closetag) + _ -> mzero + _ -> mzero + +htmlInBalanced' :: String + -> [Tag String] + -> [Tag String] +htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts + where go :: Int -> [Tag String] -> Maybe [Tag String] + go n (t@(TagOpen tn' _):rest) | tn' == tagname = + (t :) <$> go (n + 1) rest + go 1 (t@(TagClose tn'):_) | tn' == tagname = + return [t] + go n (t@(TagClose tn'):rest) | tn' == tagname = + (t :) <$> go (n - 1) rest + go n (t:ts') = (t :) <$> go n ts' + go _ [] = mzero + +hasTagWarning :: [Tag String] -> Bool +hasTagWarning (TagWarning _:_) = True +hasTagWarning _ = False + +-- | Matches a tag meeting a certain condition. +htmlTag :: Monad m + => (Tag String -> Bool) + -> ParserT [Char] st m (Tag String, String) +htmlTag f = try $ do + lookAhead (char '<') + inp <- getInput + let (next : _) = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False } inp + guard $ f next + let handleTag tagname = do + -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> + -- should NOT be parsed as an HTML tag, see #2277 + guard $ not ('.' `elem` tagname) + -- <https://example.org> should NOT be a tag either. + -- tagsoup will parse it as TagOpen "https:" [("example.org","")] + guard $ not (null tagname) + guard $ last tagname /= ':' + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") + case next of + TagComment s + | "<!--" `isPrefixOf` inp -> do + count (length s + 4) anyChar + skipMany (satisfy (/='>')) + char '>' + return (next, "<!--" ++ s ++ "-->") + | otherwise -> fail "bogus comment mode, HTML5 parse error" + TagOpen tagname _attr -> handleTag tagname + TagClose tagname -> handleTag tagname + _ -> mzero + +mkAttr :: [(String, String)] -> Attr +mkAttr attr = (attribsId, attribsClasses, attribsKV) + where attribsId = fromMaybe "" $ lookup "id" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes + attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- Instances + +instance HasIdentifierList HTMLState where + extractIdentifierList = identifiers + updateIdentifierList f s = s{ identifiers = f (identifiers s) } + +instance HasHeaderMap HTMLState where + extractHeaderMap = headerMap + updateHeaderMap f s = s{ headerMap = f (headerMap s) } + +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) + +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState + + +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + + +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + +{- + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs new file mode 100644 index 000000000..310a04574 --- /dev/null +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -0,0 +1,160 @@ +{-# LANGUAGE CPP #-} +{- | + Module : Text.Pandoc.Readers.Haddock + Copyright : Copyright (C) 2013 David Lazar + License : GNU GPL, version 2 or above + + Maintainer : David Lazar <lazar6@illinois.edu>, + John MacFarlane <jgm@berkeley.edu> + Stability : alpha + +Conversion of Haddock markup to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Haddock + ( readHaddock + ) where + +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Data.Monoid ((<>)) +import Text.Pandoc.Shared (trim, splitBy) +import Data.List (intersperse, stripPrefix) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Documentation.Haddock.Parser +import Documentation.Haddock.Types +import Text.Pandoc.Error +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad) + + +-- | Parse Haddock markup and return a 'Pandoc' document. +readHaddock :: PandocMonad m + => ReaderOptions + -> String + -> m Pandoc +readHaddock opts s = case readHaddockEither opts s of + Right result -> return result + Left e -> throwError e + +readHaddockEither :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse + -> Either PandocError Pandoc +readHaddockEither _opts = +#if MIN_VERSION_haddock_library(1,2,0) + Right . B.doc . docHToBlocks . _doc . parseParas +#else + Right . B.doc . docHToBlocks . parseParas +#endif + +docHToBlocks :: DocH String Identifier -> Blocks +docHToBlocks d' = + case d' of + DocEmpty -> mempty + DocAppend (DocParagraph (DocHeader h)) (DocParagraph (DocAName ident)) -> + B.headerWith (ident,[],[]) (headerLevel h) + (docHToInlines False $ headerTitle h) + DocAppend d1 d2 -> mappend (docHToBlocks d1) (docHToBlocks d2) + DocString _ -> inlineFallback + DocParagraph (DocAName h) -> B.plain $ docHToInlines False $ DocAName h + DocParagraph x -> B.para $ docHToInlines False x + DocIdentifier _ -> inlineFallback + DocIdentifierUnchecked _ -> inlineFallback + DocModule s -> B.plain $ docHToInlines False $ DocModule s + DocWarning _ -> mempty -- TODO + DocEmphasis _ -> inlineFallback + DocMonospaced _ -> inlineFallback + DocBold _ -> inlineFallback +#if MIN_VERSION_haddock_library(1,4,0) + DocMathInline _ -> inlineFallback + DocMathDisplay _ -> inlineFallback +#endif + DocHeader h -> B.header (headerLevel h) + (docHToInlines False $ headerTitle h) + DocUnorderedList items -> B.bulletList (map docHToBlocks items) + DocOrderedList items -> B.orderedList (map docHToBlocks items) + DocDefList items -> B.definitionList (map (\(d,t) -> + (docHToInlines False d, + [consolidatePlains $ docHToBlocks t])) items) + DocCodeBlock (DocString s) -> B.codeBlockWith ("",[],[]) s + DocCodeBlock d -> B.para $ docHToInlines True d + DocHyperlink _ -> inlineFallback + DocPic _ -> inlineFallback + DocAName _ -> inlineFallback + DocProperty s -> B.codeBlockWith ("",["property","haskell"],[]) (trim s) + DocExamples es -> mconcat $ map (\e -> + makeExample ">>>" (exampleExpression e) (exampleResult e)) es + + where inlineFallback = B.plain $ docHToInlines False d' + consolidatePlains = B.fromList . consolidatePlains' . B.toList + consolidatePlains' zs@(Plain _ : _) = + let (xs, ys) = span isPlain zs in + Para (concatMap extractContents xs) : consolidatePlains' ys + consolidatePlains' (x : xs) = x : consolidatePlains' xs + consolidatePlains' [] = [] + isPlain (Plain _) = True + isPlain _ = False + extractContents (Plain xs) = xs + extractContents _ = [] + +docHToInlines :: Bool -> DocH String Identifier -> Inlines +docHToInlines isCode d' = + case d' of + DocEmpty -> mempty + DocAppend d1 d2 -> mappend (docHToInlines isCode d1) + (docHToInlines isCode d2) + DocString s + | isCode -> mconcat $ intersperse B.linebreak + $ map B.code $ splitBy (=='\n') s + | otherwise -> B.text s + DocParagraph _ -> mempty + DocIdentifier (_,s,_) -> B.codeWith ("",["haskell","identifier"],[]) s + DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) s + DocModule s -> B.codeWith ("",["haskell","module"],[]) s + DocWarning _ -> mempty -- TODO + DocEmphasis d -> B.emph (docHToInlines isCode d) + DocMonospaced (DocString s) -> B.code s + DocMonospaced d -> docHToInlines True d + DocBold d -> B.strong (docHToInlines isCode d) +#if MIN_VERSION_haddock_library(1,4,0) + DocMathInline s -> B.math s + DocMathDisplay s -> B.displayMath s +#endif + DocHeader _ -> mempty + DocUnorderedList _ -> mempty + DocOrderedList _ -> mempty + DocDefList _ -> mempty + DocCodeBlock _ -> mempty + DocHyperlink h -> B.link (hyperlinkUrl h) (hyperlinkUrl h) + (maybe (B.text $ hyperlinkUrl h) B.text $ hyperlinkLabel h) + DocPic p -> B.image (pictureUri p) (fromMaybe (pictureUri p) $ pictureTitle p) + (maybe mempty B.text $ pictureTitle p) + DocAName s -> B.spanWith (s,["anchor"],[]) mempty + DocProperty _ -> mempty + DocExamples _ -> mempty + +-- | Create an 'Example', stripping superfluous characters as appropriate +makeExample :: String -> String -> [String] -> Blocks +makeExample prompt expression result = + B.para $ B.codeWith ("",["prompt"],[]) prompt + <> B.space + <> B.codeWith ([], ["haskell","expr"], []) (trim expression) + <> B.linebreak + <> (mconcat $ intersperse B.linebreak $ map coder result') + where + -- 1. drop trailing whitespace from the prompt, remember the prefix + prefix = takeWhile (`elem` " \t") prompt + + -- 2. drop, if possible, the exact same sequence of whitespace + -- characters from each result line + -- + -- 3. interpret lines that only contain the string "<BLANKLINE>" as an + -- empty line + result' = map (substituteBlankLine . tryStripPrefix prefix) result + where + tryStripPrefix xs ys = fromMaybe ys $ stripPrefix xs ys + + substituteBlankLine "<BLANKLINE>" = "" + substituteBlankLine line = line + coder = B.codeWith ([], ["result"], []) diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs new file mode 100644 index 000000000..9f9a79535 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -0,0 +1,1437 @@ +{-# LANGUAGE ScopedTypeVariables, OverloadedStrings #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.LaTeX + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of LaTeX to 'Pandoc' document. +-} +module Text.Pandoc.Readers.LaTeX ( readLaTeX, + rawLaTeXInline, + rawLaTeXBlock, + inlineCommand, + ) where + +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Shared +import Text.Pandoc.Options +import Text.Pandoc.Logging +import Text.Pandoc.Parsing hiding ((<|>), many, optional, space, + mathDisplay, mathInline) +import Data.Char ( chr, ord, isLetter, isAlphaNum ) +import Control.Monad +import Text.Pandoc.Builder +import Control.Applicative ((<|>), many, optional) +import Data.Maybe (fromMaybe, maybeToList) +import System.FilePath (replaceExtension, takeExtension, addExtension) +import Data.List (intercalate) +import qualified Data.Map as M +import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension) +import Text.Pandoc.ImageSize (numUnit, showFl) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad, PandocPure, lookupEnv, report, + readFileFromDirs) + +-- | Parse LaTeX from string and return 'Pandoc' document. +readLaTeX :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assumes @'\n'@ line endings) + -> m Pandoc +readLaTeX opts ltx = do + parsed <- readWithM parseLaTeX def{ stateOptions = opts } ltx + case parsed of + Right result -> return result + Left e -> throwError e + +parseLaTeX :: PandocMonad m => LP m Pandoc +parseLaTeX = do + bs <- blocks + eof + st <- getState + let meta = stateMeta st + let (Pandoc _ bs') = doc bs + return $ Pandoc meta bs' + +type LP m = ParserT String ParserState m + +anyControlSeq :: PandocMonad m => LP m String +anyControlSeq = do + char '\\' + next <- option '\n' anyChar + case next of + '\n' -> return "" + c | isLetter c -> (c:) <$> (many letter <* optional sp) + | otherwise -> return [c] + +controlSeq :: PandocMonad m => String -> LP m String +controlSeq name = try $ do + char '\\' + case name of + "" -> mzero + [c] | not (isLetter c) -> string [c] + cs -> string cs <* notFollowedBy letter <* optional sp + return name + +dimenarg :: PandocMonad m => LP m String +dimenarg = try $ do + ch <- option "" $ string "=" + num <- many1 digit + dim <- oneOfStrings ["pt","pc","in","bp","cm","mm","dd","cc","sp"] + return $ ch ++ num ++ dim + +sp :: PandocMonad m => LP m () +sp = whitespace <|> endline + +whitespace :: PandocMonad m => LP m () +whitespace = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') + +endline :: PandocMonad m => LP m () +endline = try (newline >> lookAhead anyChar >> notFollowedBy blankline) + +isLowerHex :: Char -> Bool +isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' + +tildeEscape :: PandocMonad m => LP m Char +tildeEscape = try $ do + string "^^" + c <- satisfy (\x -> x >= '\0' && x <= '\128') + d <- if isLowerHex c + then option "" $ count 1 (satisfy isLowerHex) + else return "" + if null d + then case ord c of + x | x >= 64 && x <= 127 -> return $ chr (x - 64) + | otherwise -> return $ chr (x + 64) + else return $ chr $ read ('0':'x':c:d) + +comment :: PandocMonad m => LP m () +comment = do + char '%' + skipMany (satisfy (/='\n')) + optional newline + return () + +bgroup :: PandocMonad m => LP m () +bgroup = try $ do + skipMany (spaceChar <|> try (newline <* notFollowedBy blankline)) + () <$ char '{' + <|> () <$ controlSeq "bgroup" + <|> () <$ controlSeq "begingroup" + +egroup :: PandocMonad m => LP m () +egroup = () <$ char '}' + <|> () <$ controlSeq "egroup" + <|> () <$ controlSeq "endgroup" + +grouped :: PandocMonad m => Monoid a => LP m a -> LP m a +grouped parser = try $ bgroup *> (mconcat <$> manyTill parser egroup) + +braced :: PandocMonad m => LP m String +braced = bgroup *> (concat <$> manyTill + ( many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{')) + <|> try (string "\\}") + <|> try (string "\\{") + <|> try (string "\\\\") + <|> ((\x -> "{" ++ x ++ "}") <$> braced) + <|> count 1 anyChar + ) egroup) + +bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a +bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']')) + +mathDisplay :: PandocMonad m => LP m String -> LP m Inlines +mathDisplay p = displayMath <$> (try p >>= applyMacros' . trim) + +mathInline :: PandocMonad m => LP m String -> LP m Inlines +mathInline p = math <$> (try p >>= applyMacros') + +mathChars :: PandocMonad m => LP m String +mathChars = + concat <$> many (escapedChar + <|> (snd <$> withRaw braced) + <|> many1 (satisfy isOrdChar)) + where escapedChar = try $ do char '\\' + c <- anyChar + return ['\\',c] + isOrdChar '$' = False + isOrdChar '{' = False + isOrdChar '}' = False + isOrdChar '\\' = False + isOrdChar _ = True + +quoted' :: PandocMonad m => (Inlines -> Inlines) -> LP m String -> LP m () -> LP m Inlines +quoted' f starter ender = do + startchs <- starter + smart <- extensionEnabled Ext_smart <$> getOption readerExtensions + if smart + then do + ils <- many (notFollowedBy ender >> inline) + (ender >> return (f (mconcat ils))) <|> + (<> mconcat ils) <$> + lit (case startchs of + "``" -> "“" + "`" -> "‘" + _ -> startchs) + else lit startchs + +doubleQuote :: PandocMonad m => LP m Inlines +doubleQuote = do + quoted' doubleQuoted (try $ string "``") (void $ try $ string "''") + <|> quoted' doubleQuoted (string "“") (void $ char '”') + -- the following is used by babel for localized quotes: + <|> quoted' doubleQuoted (try $ string "\"`") (void $ try $ string "\"'") + <|> quoted' doubleQuoted (string "\"") (void $ char '"') + +singleQuote :: PandocMonad m => LP m Inlines +singleQuote = do + smart <- extensionEnabled Ext_smart <$> getOption readerExtensions + if smart + then quoted' singleQuoted (string "`") (try $ char '\'' >> notFollowedBy letter) + <|> quoted' singleQuoted (string "‘") (try $ char '’' >> notFollowedBy letter) + else str <$> many1 (oneOf "`\'‘’") + +inline :: PandocMonad m => LP m Inlines +inline = (mempty <$ comment) + <|> (space <$ whitespace) + <|> (softbreak <$ endline) + <|> inlineText + <|> inlineCommand + <|> inlineEnvironment + <|> inlineGroup + <|> (char '-' *> option (str "-") + (char '-' *> option (str "–") (str "—" <$ char '-'))) + <|> doubleQuote + <|> singleQuote + <|> (str "”" <$ try (string "''")) + <|> (str "”" <$ char '”') + <|> (str "’" <$ char '\'') + <|> (str "’" <$ char '’') + <|> (str "\160" <$ char '~') + <|> mathDisplay (string "$$" *> mathChars <* string "$$") + <|> mathInline (char '$' *> mathChars <* char '$') + <|> (guardEnabled Ext_literate_haskell *> char '|' *> doLHSverb) + <|> (str . (:[]) <$> tildeEscape) + <|> (do res <- oneOf "#&~^'`\"[]" + pos <- getPosition + report $ ParsingUnescaped [res] pos + return $ str [res]) + +inlines :: PandocMonad m => LP m Inlines +inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) + +inlineGroup :: PandocMonad m => LP m Inlines +inlineGroup = do + ils <- grouped inline + if isNull ils + then return mempty + else return $ spanWith nullAttr ils + -- we need the span so we can detitlecase bibtex entries; + -- we need to know when something is {C}apitalized + +block :: PandocMonad m => LP m Blocks +block = (mempty <$ comment) + <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) + <|> environment + <|> include + <|> macro + <|> blockCommand + <|> paragraph + <|> grouped block + <|> (mempty <$ char '&') -- loose & in table environment + + +blocks :: PandocMonad m => LP m Blocks +blocks = mconcat <$> many block + +getRawCommand :: PandocMonad m => String -> LP m String +getRawCommand name' = do + rawargs <- withRaw (many (try (optional sp *> opt)) *> + option "" (try (optional sp *> dimenarg)) *> + many braced) + return $ '\\' : name' ++ snd rawargs + +lookupListDefault :: (Ord k) => v -> [k] -> M.Map k v -> v +lookupListDefault d = (fromMaybe d .) . lookupList + where + lookupList l m = msum $ map (`M.lookup` m) l + +blockCommand :: PandocMonad m => LP m Blocks +blockCommand = try $ do + name <- anyControlSeq + guard $ name /= "begin" && name /= "end" + star <- option "" (string "*" <* optional sp) + let name' = name ++ star + let raw = do + rawcommand <- getRawCommand name' + transformed <- applyMacros' rawcommand + guard $ transformed /= rawcommand + notFollowedBy $ parseFromString inlines transformed + parseFromString blocks transformed + lookupListDefault raw [name',name] blockCommands + +inBrackets :: Inlines -> Inlines +inBrackets x = str "[" <> x <> str "]" + +-- eat an optional argument and one or more arguments in braces +ignoreInlines :: PandocMonad m => String -> (String, LP m Inlines) +ignoreInlines name = (name, p) + where + p = do oa <- optargs + let rawCommand = '\\':name ++ oa + let doraw = guardRaw >> return (rawInline "latex" rawCommand) + doraw <|> ignore rawCommand + +guardRaw :: PandocMonad m => LP m () +guardRaw = getOption readerExtensions >>= guard . extensionEnabled Ext_raw_tex + +optargs :: PandocMonad m => LP m String +optargs = snd <$> withRaw (skipopts *> skipMany (try $ optional sp *> braced)) + +ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a +ignore raw = do + pos <- getPosition + report $ SkippedContent raw pos + return mempty + +ignoreBlocks :: PandocMonad m => String -> (String, LP m Blocks) +ignoreBlocks name = (name, p) + where + p = do oa <- optargs + let rawCommand = '\\':name ++ oa + let doraw = guardRaw >> return (rawBlock "latex" rawCommand) + doraw <|> ignore rawCommand + +blockCommands :: PandocMonad m => M.Map String (LP m Blocks) +blockCommands = M.fromList $ + [ ("par", mempty <$ skipopts) + , ("parbox", braced >> grouped blocks) + , ("title", mempty <$ (skipopts *> + (grouped inline >>= addMeta "title") + <|> (grouped block >>= addMeta "title"))) + , ("subtitle", mempty <$ (skipopts *> tok >>= addMeta "subtitle")) + , ("author", mempty <$ (skipopts *> authors)) + -- -- in letter class, temp. store address & sig as title, author + , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) + , ("signature", mempty <$ (skipopts *> authors)) + , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) + -- sectioning + , ("chapter", updateState (\s -> s{ stateHasChapters = True }) + *> section nullAttr 0) + , ("chapter*", updateState (\s -> s{ stateHasChapters = True }) + *> section ("",["unnumbered"],[]) 0) + , ("section", section nullAttr 1) + , ("section*", section ("",["unnumbered"],[]) 1) + , ("subsection", section nullAttr 2) + , ("subsection*", section ("",["unnumbered"],[]) 2) + , ("subsubsection", section nullAttr 3) + , ("subsubsection*", section ("",["unnumbered"],[]) 3) + , ("paragraph", section nullAttr 4) + , ("paragraph*", section ("",["unnumbered"],[]) 4) + , ("subparagraph", section nullAttr 5) + , ("subparagraph*", section ("",["unnumbered"],[]) 5) + -- beamer slides + , ("frametitle", section nullAttr 3) + , ("framesubtitle", section nullAttr 4) + -- letters + , ("opening", (para . trimInlines) <$> (skipopts *> tok)) + , ("closing", skipopts *> closing) + -- + , ("hrule", pure horizontalRule) + , ("strut", pure mempty) + , ("rule", skipopts *> tok *> tok *> pure horizontalRule) + , ("item", skipopts *> looseItem) + , ("documentclass", skipopts *> braced *> preamble) + , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> setCaption) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs)) + -- includes + , ("lstinputlisting", inputListing) + ] ++ map ignoreBlocks + -- these commands will be ignored unless --parse-raw is specified, + -- in which case they will appear as raw latex blocks + [ "newcommand", "renewcommand", "newenvironment", "renewenvironment" + -- newcommand, etc. should be parsed by macro, but we need this + -- here so these aren't parsed as inline commands to ignore + , "special", "pdfannot", "pdfstringdef" + , "bibliographystyle" + , "maketitle", "makeindex", "makeglossary" + , "addcontentsline", "addtocontents", "addtocounter" + -- \ignore{} is used conventionally in literate haskell for definitions + -- that are to be processed by the compiler but not printed. + , "ignore" + , "hyperdef" + , "markboth", "markright", "markleft" + , "newpage" + ] + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> LP m () +addMeta field val = updateState $ \st -> + st{ stateMeta = addMetaField field val $ stateMeta st } + +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') + +setCaption :: PandocMonad m => LP m Blocks +setCaption = do + ils <- tok + mblabel <- option Nothing $ + try $ spaces' >> controlSeq "label" >> (Just <$> tok) + let ils' = case mblabel of + Just lab -> ils <> spanWith + ("",[],[("data-label", stringify lab)]) mempty + Nothing -> ils + updateState $ \st -> st{ stateCaption = Just ils' } + return mempty + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ stateCaption = Nothing } + +authors :: PandocMonad m => LP m () +authors = try $ do + char '{' + let oneAuthor = mconcat <$> + many1 (notFollowedBy' (controlSeq "and") >> + (inline <|> mempty <$ blockCommand)) + -- skip e.g. \vspace{10pt} + auths <- sepBy oneAuthor (controlSeq "and") + char '}' + addMeta "author" (map trimInlines auths) + +section :: PandocMonad m => Attr -> Int -> LP m Blocks +section (ident, classes, kvs) lvl = do + hasChapters <- stateHasChapters `fmap` getState + let lvl' = if hasChapters then lvl + 1 else lvl + skipopts + contents <- grouped inline + lab <- option ident $ try (spaces' >> controlSeq "label" >> spaces' >> braced) + attr' <- registerHeader (lab, classes, kvs) contents + return $ headerWith attr' lvl' contents + +inlineCommand :: PandocMonad m => LP m Inlines +inlineCommand = try $ do + name <- anyControlSeq + guard $ name /= "begin" && name /= "end" + guard $ not $ isBlockCommand name + exts <- getOption readerExtensions + star <- option "" (string "*") + let name' = name ++ star + let raw = do + rawargs <- withRaw + (skipangles *> skipopts *> option "" dimenarg *> many braced) + let rawcommand = '\\' : name ++ star ++ snd rawargs + transformed <- applyMacros' rawcommand + if transformed /= rawcommand + then parseFromString inlines transformed + else if extensionEnabled Ext_raw_tex exts + then return $ rawInline "latex" rawcommand + else ignore rawcommand + (lookupListDefault mzero [name',name] inlineCommands <* + optional (try (string "{}"))) + <|> raw + +unlessParseRaw :: PandocMonad m => LP m () +unlessParseRaw = getOption readerExtensions >>= + guard . not . extensionEnabled Ext_raw_tex + +isBlockCommand :: String -> Bool +isBlockCommand s = s `M.member` (blockCommands :: M.Map String (LP PandocPure Blocks)) + + +inlineEnvironments :: PandocMonad m => M.Map String (LP m Inlines) +inlineEnvironments = M.fromList + [ ("displaymath", mathEnv id Nothing "displaymath") + , ("math", math <$> verbEnv "math") + , ("equation", mathEnv id Nothing "equation") + , ("equation*", mathEnv id Nothing "equation*") + , ("gather", mathEnv id (Just "gathered") "gather") + , ("gather*", mathEnv id (Just "gathered") "gather*") + , ("multline", mathEnv id (Just "gathered") "multline") + , ("multline*", mathEnv id (Just "gathered") "multline*") + , ("eqnarray", mathEnv id (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnv id (Just "aligned") "eqnarray*") + , ("align", mathEnv id (Just "aligned") "align") + , ("align*", mathEnv id (Just "aligned") "align*") + , ("alignat", mathEnv id (Just "aligned") "alignat") + , ("alignat*", mathEnv id (Just "aligned") "alignat*") + ] + +inlineCommands :: PandocMonad m => M.Map String (LP m Inlines) +inlineCommands = M.fromList $ + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("sout", extractSpaces strikeout <$> tok) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("slash", lit "/") + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) + , ("ldots", lit "…") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("label", unlessParseRaw >> (inBrackets <$> tok)) + , ("ref", unlessParseRaw >> (inBrackets <$> tok)) + , ("noindent", unlessParseRaw >> ignore "noindent") + , ("textgreek", tok) + , ("sep", lit ",") + , ("cref", unlessParseRaw >> (inBrackets <$> tok)) -- from cleveref.sty + , ("(", mathInline $ manyTill anyChar (try $ string "\\)")) + , ("[", mathDisplay $ manyTill anyChar (try $ string "\\]")) + , ("ensuremath", mathInline braced) + , ("texorpdfstring", (\_ x -> x) <$> tok <*> tok) + , ("P", lit "¶") + , ("S", lit "§") + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + -- old TeX commands + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) + , ("rm", inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) + , ("/", pure mempty) -- italic correction + , ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("textasciicircum", lit "^") + , ("textasciitilde", lit "~") + , ("H", try $ tok >>= accent hungarumlaut) + , ("`", option (str "`") $ try $ tok >>= accent grave) + , ("'", option (str "'") $ try $ tok >>= accent acute) + , ("^", option (str "^") $ try $ tok >>= accent circ) + , ("~", option (str "~") $ try $ tok >>= accent tilde) + , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) + , (".", option (str ".") $ try $ tok >>= accent dot) + , ("=", option (str "=") $ try $ tok >>= accent macron) + , ("c", option (str "c") $ try $ tok >>= accent cedilla) + , ("v", option (str "v") $ try $ tok >>= accent hacek) + , ("u", option (str "u") $ try $ tok >>= accent breve) + , ("i", lit "i") + , ("\\", linebreak <$ (optional (bracketed inline) *> spaces')) + , (",", pure mempty) + , ("@", pure mempty) + , (" ", lit "\160") + , ("ps", pure $ str "PS." <> space) + , ("TeX", lit "TeX") + , ("LaTeX", lit "LaTeX") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("thanks", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) + , ("footnote", (note . mconcat) <$> (char '{' *> manyTill block (char '}'))) + , ("verb", doverb) + , ("lstinline", skipopts *> doverb) + , ("Verb", doverb) + , ("texttt", (code . stringify . toList) <$> tok) + , ("url", (unescapeURL <$> braced) >>= \url -> + pure (link url "" (str url))) + , ("href", (unescapeURL <$> braced <* optional sp) >>= \url -> + tok >>= \lab -> + pure (link url "" lab)) + , ("includegraphics", do options <- option [] keyvals + src <- unescapeURL . removeDoubleQuotes <$> braced + mkImage options src) + , ("enquote", enquote) + , ("cite", citation "cite" NormalCitation False) + , ("Cite", citation "Cite" NormalCitation False) + , ("citep", citation "citep" NormalCitation False) + , ("citep*", citation "citep*" NormalCitation False) + , ("citeal", citation "citeal" NormalCitation False) + , ("citealp", citation "citealp" NormalCitation False) + , ("citealp*", citation "citealp*" NormalCitation False) + , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) + , ("footcite", inNote <$> citation "footcite" NormalCitation False) + , ("parencite", citation "parencite" NormalCitation False) + , ("supercite", citation "supercite" NormalCitation False) + , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) + , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) + , ("citeyear", citation "citeyear" SuppressAuthor False) + , ("autocite*", citation "autocite*" SuppressAuthor False) + , ("cite*", citation "cite*" SuppressAuthor False) + , ("parencite*", citation "parencite*" SuppressAuthor False) + , ("textcite", citation "textcite" AuthorInText False) + , ("citet", citation "citet" AuthorInText False) + , ("citet*", citation "citet*" AuthorInText False) + , ("citealt", citation "citealt" AuthorInText False) + , ("citealt*", citation "citealt*" AuthorInText False) + , ("textcites", citation "textcites" AuthorInText True) + , ("cites", citation "cites" NormalCitation True) + , ("autocites", citation "autocites" NormalCitation True) + , ("footcites", inNote <$> citation "footcites" NormalCitation True) + , ("parencites", citation "parencites" NormalCitation True) + , ("supercites", citation "supercites" NormalCitation True) + , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) + , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) + , ("Footcite", citation "Footcite" NormalCitation False) + , ("Parencite", citation "Parencite" NormalCitation False) + , ("Supercite", citation "Supercite" NormalCitation False) + , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) + , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) + , ("Citeyear", citation "Citeyear" SuppressAuthor False) + , ("Autocite*", citation "Autocite*" SuppressAuthor False) + , ("Cite*", citation "Cite*" SuppressAuthor False) + , ("Parencite*", citation "Parencite*" SuppressAuthor False) + , ("Textcite", citation "Textcite" AuthorInText False) + , ("Textcites", citation "Textcites" AuthorInText True) + , ("Cites", citation "Cites" NormalCitation True) + , ("Autocites", citation "Autocites" NormalCitation True) + , ("Footcites", citation "Footcites" NormalCitation True) + , ("Parencites", citation "Parencites" NormalCitation True) + , ("Supercites", citation "Supercites" NormalCitation True) + , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) + , ("citetext", complexNatbibCitation NormalCitation) + , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> + complexNatbibCitation AuthorInText) + <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) + ] ++ map ignoreInlines + -- these commands will be ignored unless --parse-raw is specified, + -- in which case they will appear as raw latex blocks: + [ "index" ] + +mkImage :: PandocMonad m => [(String, String)] -> String -> LP m Inlines +mkImage options src = do + let replaceTextwidth (k,v) = case numUnit v of + Just (num, "\\textwidth") -> (k, showFl (num * 100) ++ "%") + _ -> (k, v) + let kvs = map replaceTextwidth $ filter (\(k,_) -> k `elem` ["width", "height"]) options + let attr = ("",[], kvs) + let alt = str "image" + case takeExtension src of + "" -> do + defaultExt <- getOption readerDefaultImageExtension + return $ imageWith attr (addExtension src defaultExt) "" alt + _ -> return $ imageWith attr src "" alt + +inNote :: Inlines -> Inlines +inNote ils = + note $ para $ ils <> str "." + +unescapeURL :: String -> String +unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs + where isEscapable c = c `elem` ("#$%&~_^\\{}" :: String) +unescapeURL (x:xs) = x:unescapeURL xs +unescapeURL [] = "" + +enquote :: PandocMonad m => LP m Inlines +enquote = do + skipopts + context <- stateQuoteContext <$> getState + if context == InDoubleQuote + then singleQuoted <$> withQuoteContext InSingleQuote tok + else doubleQuoted <$> withQuoteContext InDoubleQuote tok + +doverb :: PandocMonad m => LP m Inlines +doverb = do + marker <- anyChar + code <$> manyTill (satisfy (/='\n')) (char marker) + +doLHSverb :: PandocMonad m => LP m Inlines +doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char '|') + +lit :: String -> LP m Inlines +lit = pure . str + +accent :: (Char -> String) -> Inlines -> LP m Inlines +accent f ils = + case toList ils of + (Str (x:xs) : ys) -> return $ fromList (Str (f x ++ xs) : ys) + [] -> mzero + _ -> return ils + +grave :: Char -> String +grave 'A' = "À" +grave 'E' = "È" +grave 'I' = "Ì" +grave 'O' = "Ò" +grave 'U' = "Ù" +grave 'a' = "à" +grave 'e' = "è" +grave 'i' = "ì" +grave 'o' = "ò" +grave 'u' = "ù" +grave c = [c] + +acute :: Char -> String +acute 'A' = "Á" +acute 'E' = "É" +acute 'I' = "Í" +acute 'O' = "Ó" +acute 'U' = "Ú" +acute 'Y' = "Ý" +acute 'a' = "á" +acute 'e' = "é" +acute 'i' = "í" +acute 'o' = "ó" +acute 'u' = "ú" +acute 'y' = "ý" +acute 'C' = "Ć" +acute 'c' = "ć" +acute 'L' = "Ĺ" +acute 'l' = "ĺ" +acute 'N' = "Ń" +acute 'n' = "ń" +acute 'R' = "Ŕ" +acute 'r' = "ŕ" +acute 'S' = "Ś" +acute 's' = "ś" +acute 'Z' = "Ź" +acute 'z' = "ź" +acute c = [c] + +circ :: Char -> String +circ 'A' = "Â" +circ 'E' = "Ê" +circ 'I' = "Î" +circ 'O' = "Ô" +circ 'U' = "Û" +circ 'a' = "â" +circ 'e' = "ê" +circ 'i' = "î" +circ 'o' = "ô" +circ 'u' = "û" +circ 'C' = "Ĉ" +circ 'c' = "ĉ" +circ 'G' = "Ĝ" +circ 'g' = "ĝ" +circ 'H' = "Ĥ" +circ 'h' = "ĥ" +circ 'J' = "Ĵ" +circ 'j' = "ĵ" +circ 'S' = "Ŝ" +circ 's' = "ŝ" +circ 'W' = "Ŵ" +circ 'w' = "ŵ" +circ 'Y' = "Ŷ" +circ 'y' = "ŷ" +circ c = [c] + +tilde :: Char -> String +tilde 'A' = "Ã" +tilde 'a' = "ã" +tilde 'O' = "Õ" +tilde 'o' = "õ" +tilde 'I' = "Ĩ" +tilde 'i' = "ĩ" +tilde 'U' = "Ũ" +tilde 'u' = "ũ" +tilde 'N' = "Ñ" +tilde 'n' = "ñ" +tilde c = [c] + +umlaut :: Char -> String +umlaut 'A' = "Ä" +umlaut 'E' = "Ë" +umlaut 'I' = "Ï" +umlaut 'O' = "Ö" +umlaut 'U' = "Ü" +umlaut 'a' = "ä" +umlaut 'e' = "ë" +umlaut 'i' = "ï" +umlaut 'o' = "ö" +umlaut 'u' = "ü" +umlaut c = [c] + +hungarumlaut :: Char -> String +hungarumlaut 'A' = "A̋" +hungarumlaut 'E' = "E̋" +hungarumlaut 'I' = "I̋" +hungarumlaut 'O' = "Ő" +hungarumlaut 'U' = "Ű" +hungarumlaut 'Y' = "ӳ" +hungarumlaut 'a' = "a̋" +hungarumlaut 'e' = "e̋" +hungarumlaut 'i' = "i̋" +hungarumlaut 'o' = "ő" +hungarumlaut 'u' = "ű" +hungarumlaut 'y' = "ӳ" +hungarumlaut c = [c] + +dot :: Char -> String +dot 'C' = "Ċ" +dot 'c' = "ċ" +dot 'E' = "Ė" +dot 'e' = "ė" +dot 'G' = "Ġ" +dot 'g' = "ġ" +dot 'I' = "İ" +dot 'Z' = "Ż" +dot 'z' = "ż" +dot c = [c] + +macron :: Char -> String +macron 'A' = "Ā" +macron 'E' = "Ē" +macron 'I' = "Ī" +macron 'O' = "Ō" +macron 'U' = "Ū" +macron 'a' = "ā" +macron 'e' = "ē" +macron 'i' = "ī" +macron 'o' = "ō" +macron 'u' = "ū" +macron c = [c] + +cedilla :: Char -> String +cedilla 'c' = "ç" +cedilla 'C' = "Ç" +cedilla 's' = "ş" +cedilla 'S' = "Ş" +cedilla 't' = "ţ" +cedilla 'T' = "Ţ" +cedilla 'e' = "ȩ" +cedilla 'E' = "Ȩ" +cedilla 'h' = "ḩ" +cedilla 'H' = "Ḩ" +cedilla 'o' = "o̧" +cedilla 'O' = "O̧" +cedilla c = [c] + +hacek :: Char -> String +hacek 'A' = "Ǎ" +hacek 'a' = "ǎ" +hacek 'C' = "Č" +hacek 'c' = "č" +hacek 'D' = "Ď" +hacek 'd' = "ď" +hacek 'E' = "Ě" +hacek 'e' = "ě" +hacek 'G' = "Ǧ" +hacek 'g' = "ǧ" +hacek 'H' = "Ȟ" +hacek 'h' = "ȟ" +hacek 'I' = "Ǐ" +hacek 'i' = "ǐ" +hacek 'j' = "ǰ" +hacek 'K' = "Ǩ" +hacek 'k' = "ǩ" +hacek 'L' = "Ľ" +hacek 'l' = "ľ" +hacek 'N' = "Ň" +hacek 'n' = "ň" +hacek 'O' = "Ǒ" +hacek 'o' = "ǒ" +hacek 'R' = "Ř" +hacek 'r' = "ř" +hacek 'S' = "Š" +hacek 's' = "š" +hacek 'T' = "Ť" +hacek 't' = "ť" +hacek 'U' = "Ǔ" +hacek 'u' = "ǔ" +hacek 'Z' = "Ž" +hacek 'z' = "ž" +hacek c = [c] + +breve :: Char -> String +breve 'A' = "Ă" +breve 'a' = "ă" +breve 'E' = "Ĕ" +breve 'e' = "ĕ" +breve 'G' = "Ğ" +breve 'g' = "ğ" +breve 'I' = "Ĭ" +breve 'i' = "ĭ" +breve 'O' = "Ŏ" +breve 'o' = "ŏ" +breve 'U' = "Ŭ" +breve 'u' = "ŭ" +breve c = [c] + +tok :: PandocMonad m => LP m Inlines +tok = try $ grouped inline <|> inlineCommand <|> str <$> count 1 inlineChar + +opt :: PandocMonad m => LP m Inlines +opt = bracketed inline + +rawopt :: PandocMonad m => LP m String +rawopt = do + contents <- bracketed (many1 (noneOf "[]") <|> try (string "\\]") <|> + try (string "\\[") <|> rawopt) + optional sp + return $ "[" ++ contents ++ "]" + +skipopts :: PandocMonad m => LP m () +skipopts = skipMany rawopt + +-- opts in angle brackets are used in beamer +rawangle :: PandocMonad m => LP m () +rawangle = try $ do + char '<' + skipMany (noneOf ">") + char '>' + return () + +skipangles :: PandocMonad m => LP m () +skipangles = skipMany rawangle + +inlineText :: PandocMonad m => LP m Inlines +inlineText = str <$> many1 inlineChar + +inlineChar :: PandocMonad m => LP m Char +inlineChar = noneOf "\\$%&~#{}^'`\"‘’“”-[] \t\n" + +environment :: PandocMonad m => LP m Blocks +environment = do + controlSeq "begin" + name <- braced + M.findWithDefault mzero name environments + <|> rawEnv name + +inlineEnvironment :: PandocMonad m => LP m Inlines +inlineEnvironment = try $ do + controlSeq "begin" + name <- braced + M.findWithDefault mzero name inlineEnvironments + +rawEnv :: PandocMonad m => String -> LP m Blocks +rawEnv name = do + exts <- getOption readerExtensions + let parseRaw = extensionEnabled Ext_raw_tex exts + rawOptions <- mconcat <$> many rawopt + let beginCommand = "\\begin{" ++ name ++ "}" ++ rawOptions + unless parseRaw $ do + pos1 <- getPosition + report $ SkippedContent beginCommand pos1 + (bs, raw) <- withRaw $ env name blocks + raw' <- applyMacros' raw + if parseRaw + then return $ rawBlock "latex" $ beginCommand ++ raw' + else do + pos2 <- getPosition + report $ SkippedContent ("\\end{" ++ name ++ "}") pos2 + return bs + +---- + +braced' :: PandocMonad m => LP m String +braced' = try $ char '{' *> manyTill (satisfy (/='}')) (char '}') + +maybeAddExtension :: String -> FilePath -> FilePath +maybeAddExtension ext fp = + if null (takeExtension fp) + then addExtension fp ext + else fp + +include :: PandocMonad m => LP m Blocks +include = do + fs' <- try $ do + char '\\' + name <- try (string "include") + <|> try (string "input") + <|> string "usepackage" + -- skip options + skipMany $ try $ char '[' *> manyTill anyChar (char ']') + fs <- (map trim . splitBy (==',')) <$> braced' + return $ if name == "usepackage" + then map (maybeAddExtension ".sty") fs + else map (maybeAddExtension ".tex") fs + dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" + mconcat <$> mapM (insertIncludedFile blocks dirs) fs' + +inputListing :: PandocMonad m => LP m Blocks +inputListing = do + pos <- getPosition + options <- option [] keyvals + f <- filter (/='"') <$> braced + dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" + mbCode <- readFileFromDirs dirs f + codeLines <- case mbCode of + Just s -> return $ lines s + Nothing -> do + report $ CouldNotLoadIncludeFile f pos + return [] + let (ident,classes,kvs) = parseListingsOptions options + let language = case lookup "language" options >>= fromListingsLanguage of + Just l -> [l] + Nothing -> take 1 $ languagesByExtension (takeExtension f) + let firstline = fromMaybe 1 $ lookup "firstline" options >>= safeRead + let lastline = fromMaybe (length codeLines) $ + lookup "lastline" options >>= safeRead + let codeContents = intercalate "\n" $ take (1 + lastline - firstline) $ + drop (firstline - 1) codeLines + return $ codeBlockWith (ident,ordNub (classes ++ language),kvs) codeContents + +parseListingsOptions :: [(String, String)] -> Attr +parseListingsOptions options = + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + classes = [ "numberLines" | + lookup "numbers" options == Just "left" ] + ++ maybeToList (lookup "language" options + >>= fromListingsLanguage) + in (fromMaybe "" (lookup "label" options), classes, kvs) + +---- + +keyval :: PandocMonad m => LP m (String, String) +keyval = try $ do + key <- many1 alphaNum + val <- option "" $ char '=' >> many1 (alphaNum <|> char '.' <|> char '\\') + skipMany spaceChar + optional (char ',') + skipMany spaceChar + return (key, val) + + +keyvals :: PandocMonad m => LP m [(String, String)] +keyvals = try $ char '[' *> manyTill keyval (char ']') + +alltt :: PandocMonad m => String -> LP m Blocks +alltt t = walk strToCode <$> parseFromString blocks + (substitute " " "\\ " $ substitute "%" "\\%" $ + intercalate "\\\\\n" $ lines t) + where strToCode (Str s) = Code nullAttr s + strToCode x = x + +rawLaTeXBlock :: PandocMonad m => LP m String +rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) + +rawLaTeXInline :: PandocMonad m => LP m Inline +rawLaTeXInline = do + raw <- (snd <$> withRaw inlineCommand) <|> (snd <$> withRaw blockCommand) + RawInline "latex" <$> applyMacros' raw + +addImageCaption :: PandocMonad m => Blocks -> LP m Blocks +addImageCaption = walkM go + where go (Image attr alt (src,tit)) = do + mbcapt <- stateCaption <$> getState + return $ case mbcapt of + Just ils -> Image attr (toList ils) (src, "fig:") + Nothing -> Image attr alt (src,tit) + go x = return x + +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- stateCaption <$> getState + return $ case mbcapt of + Just ils -> Table (toList ils) als ws hs rs + Nothing -> Table c als ws hs rs + go x = return x + +environments :: PandocMonad m => M.Map String (LP m Blocks) +environments = M.fromList + [ ("document", env "document" blocks <* skipMany anyChar) + , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) + , ("letter", env "letter" letterContents) + , ("minipage", env "minipage" $ + skipopts *> spaces' *> optional braced *> spaces' *> blocks) + , ("figure", env "figure" $ + resetCaption *> skipopts *> blocks >>= addImageCaption) + , ("center", env "center" blocks) + , ("longtable", env "longtable" $ + resetCaption *> simpTable False >>= addTableCaption) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) + , ("tabular*", env "tabular" $ simpTable True) + , ("tabular", env "tabular" $ simpTable False) + , ("quote", blockQuote <$> env "quote" blocks) + , ("quotation", blockQuote <$> env "quotation" blocks) + , ("verse", blockQuote <$> env "verse" blocks) + , ("itemize", bulletList <$> listenv "itemize" (many item)) + , ("description", definitionList <$> listenv "description" (many descItem)) + , ("enumerate", orderedList') + , ("alltt", alltt =<< verbEnv "alltt") + , ("code", guardEnabled Ext_literate_haskell *> + (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> + verbEnv "code")) + , ("comment", mempty <$ verbEnv "comment") + , ("verbatim", codeBlock <$> verbEnv "verbatim") + , ("Verbatim", fancyverbEnv "Verbatim") + , ("BVerbatim", fancyverbEnv "BVerbatim") + , ("lstlisting", do attr <- parseListingsOptions <$> option [] keyvals + codeBlockWith attr <$> verbEnv "lstlisting") + , ("minted", do options <- option [] keyvals + lang <- grouped (many1 $ satisfy (/='}')) + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + let classes = [ lang | not (null lang) ] ++ + [ "numberLines" | + lookup "linenos" options == Just "true" ] + let attr = ("",classes,kvs) + codeBlockWith attr <$> verbEnv "minted") + , ("obeylines", parseFromString + (para . trimInlines . mconcat <$> many inline) =<< + intercalate "\\\\\n" . lines <$> verbEnv "obeylines") + , ("displaymath", mathEnv para Nothing "displaymath") + , ("equation", mathEnv para Nothing "equation") + , ("equation*", mathEnv para Nothing "equation*") + , ("gather", mathEnv para (Just "gathered") "gather") + , ("gather*", mathEnv para (Just "gathered") "gather*") + , ("multline", mathEnv para (Just "gathered") "multline") + , ("multline*", mathEnv para (Just "gathered") "multline*") + , ("eqnarray", mathEnv para (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnv para (Just "aligned") "eqnarray*") + , ("align", mathEnv para (Just "aligned") "align") + , ("align*", mathEnv para (Just "aligned") "align*") + , ("alignat", mathEnv para (Just "aligned") "alignat") + , ("alignat*", mathEnv para (Just "aligned") "alignat*") + ] + +letterContents :: PandocMonad m => LP m Blocks +letterContents = do + bs <- blocks + st <- getState + -- add signature (author) and address (title) + let addr = case lookupMeta "address" (stateMeta st) of + Just (MetaBlocks [Plain xs]) -> + para $ trimInlines $ fromList xs + _ -> mempty + return $ addr <> bs -- sig added by \closing + +closing :: PandocMonad m => LP m Blocks +closing = do + contents <- tok + st <- getState + let extractInlines (MetaBlocks [Plain ys]) = ys + extractInlines (MetaBlocks [Para ys ]) = ys + extractInlines _ = [] + let sigs = case lookupMeta "author" (stateMeta st) of + Just (MetaList xs) -> + para $ trimInlines $ fromList $ + intercalate [LineBreak] $ map extractInlines xs + _ -> mempty + return $ para (trimInlines contents) <> sigs + +item :: PandocMonad m => LP m Blocks +item = blocks *> controlSeq "item" *> skipopts *> blocks + +looseItem :: PandocMonad m => LP m Blocks +looseItem = do + ctx <- stateParserContext `fmap` getState + if ctx == ListItemState + then mzero + else return mempty + +descItem :: PandocMonad m => LP m (Inlines, [Blocks]) +descItem = do + blocks -- skip blocks before item + controlSeq "item" + optional sp + ils <- opt + bs <- blocks + return (ils, [bs]) + +env :: PandocMonad m => String -> LP m a -> LP m a +env name p = p <* + (try (controlSeq "end" *> braced >>= guard . (== name)) + <?> ("\\end{" ++ name ++ "}")) + +listenv :: PandocMonad m => String -> LP m a -> LP m a +listenv name p = try $ do + oldCtx <- stateParserContext `fmap` getState + updateState $ \st -> st{ stateParserContext = ListItemState } + res <- env name p + updateState $ \st -> st{ stateParserContext = oldCtx } + return res + +mathEnv :: PandocMonad m => (Inlines -> a) -> Maybe String -> String -> LP m a +mathEnv f innerEnv name = f <$> mathDisplay (inner <$> verbEnv name) + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" ++ y ++ "}\n" ++ x ++ + "\\end{" ++ y ++ "}" + +verbEnv :: PandocMonad m => String -> LP m String +verbEnv name = do + skipopts + optional blankline + let endEnv = try $ controlSeq "end" *> braced >>= guard . (== name) + res <- manyTill anyChar endEnv + return $ stripTrailingNewlines res + +fancyverbEnv :: PandocMonad m => String -> LP m Blocks +fancyverbEnv name = do + options <- option [] keyvals + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + let classes = [ "numberLines" | + lookup "numbers" options == Just "left" ] + let attr = ("",classes,kvs) + codeBlockWith attr <$> verbEnv name + +orderedList' :: PandocMonad m => LP m Blocks +orderedList' = do + optional sp + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ + try $ char '[' *> anyOrderedListMarker <* char ']' + spaces + optional $ try $ controlSeq "setlength" *> grouped (controlSeq "itemindent") *> braced + spaces + start <- option 1 $ try $ do controlSeq "setcounter" + grouped (string "enum" *> many1 (oneOf "iv")) + optional sp + num <- grouped (many1 digit) + spaces + return (read num + 1 :: Int) + bs <- listenv "enumerate" (many item) + return $ orderedListWith (start, style, delim) bs + +paragraph :: PandocMonad m => LP m Blocks +paragraph = do + x <- trimInlines . mconcat <$> many1 inline + if x == mempty + then return mempty + else return $ para x + +preamble :: PandocMonad m => LP m Blocks +preamble = mempty <$> manyTill preambleBlock beginDoc + where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" + preambleBlock = void comment + <|> void sp + <|> void blanklines + <|> void include + <|> void macro + <|> void blockCommand + <|> void anyControlSeq + <|> void braced + <|> void anyChar + +------- + +-- citations + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +simpleCiteArgs :: PandocMonad m => LP m [Citation] +simpleCiteArgs = try $ do + first <- optionMaybe $ toList <$> opt + second <- optionMaybe $ toList <$> opt + char '{' + optional sp + keys <- manyTill citationLabel (char '}') + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + +citationLabel :: PandocMonad m => LP m String +citationLabel = optional sp *> + (many1 (satisfy isBibtexKeyChar) + <* optional sp + <* optional (char ',') + <* optional sp) + where isBibtexKeyChar c = isAlphaNum c || c `elem` (".:;?!`'()/*@_+=-[]" :: String) + +cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] +cites mode multi = try $ do + cits <- if multi + then many1 simpleCiteArgs + else count 1 simpleCiteArgs + let cs = concat cits + return $ case mode of + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs + +citation :: PandocMonad m => String -> CitationMode -> Bool -> LP m Inlines +citation name mode multi = do + (c,raw) <- withRaw $ cites mode multi + return $ cite c (rawInline "latex" $ "\\" ++ name ++ raw) + +complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines +complexNatbibCitation mode = try $ do + let ils = (toList . trimInlines . mconcat) <$> + many (notFollowedBy (oneOf "\\};") >> inline) + let parseOne = try $ do + skipSpaces + pref <- ils + cit' <- inline -- expect a citation + let citlist = toList cit' + cits' <- case citlist of + [Cite cs _] -> return cs + _ -> mzero + suff <- ils + skipSpaces + optional $ char ';' + return $ addPrefix pref $ addSuffix suff cits' + (c:cits, raw) <- withRaw $ grouped parseOne + return $ cite (c{ citationMode = mode }:cits) + (rawInline "latex" $ "\\citetext" ++ raw) + +-- tables + +parseAligns :: PandocMonad m => LP m [(String, Alignment, String)] +parseAligns = try $ do + char '{' + let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ (char '@' >> braced) + maybeBar + let cAlign = AlignCenter <$ char 'c' + let lAlign = AlignLeft <$ char 'l' + let rAlign = AlignRight <$ char 'r' + let parAlign = AlignLeft <$ (char 'p' >> braced) + let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign + let alignPrefix = char '>' >> braced + let alignSuffix = char '<' >> braced + let alignSpec = do + spaces + pref <- option "" alignPrefix + spaces + ch <- alignChar + spaces + suff <- option "" alignSuffix + return (pref, ch, suff) + aligns' <- sepEndBy alignSpec maybeBar + spaces + char '}' + spaces + return $ aligns' + +hline :: PandocMonad m => LP m () +hline = try $ do + spaces' + controlSeq "hline" <|> + -- booktabs rules: + controlSeq "toprule" <|> + controlSeq "bottomrule" <|> + controlSeq "midrule" <|> + controlSeq "endhead" <|> + controlSeq "endfirsthead" + spaces' + optional $ bracketed (many1 (satisfy (/=']'))) + return () + +lbreak :: PandocMonad m => LP m () +lbreak = () <$ try (spaces' *> + (controlSeq "\\" <|> controlSeq "tabularnewline") <* + spaces') + +amp :: PandocMonad m => LP m () +amp = () <$ try (spaces' *> char '&' <* spaces') + +parseTableRow :: PandocMonad m + => Int -- ^ number of columns + -> [String] -- ^ prefixes + -> [String] -- ^ suffixes + -> LP m [Blocks] +parseTableRow cols prefixes suffixes = try $ do + let tableCellRaw = many (notFollowedBy + (amp <|> lbreak <|> + (() <$ try (string "\\end"))) >> anyChar) + let minipage = try $ controlSeq "begin" *> string "{minipage}" *> + env "minipage" + (skipopts *> spaces' *> optional braced *> spaces' *> blocks) + let tableCell = minipage <|> + ((plain . trimInlines . mconcat) <$> many inline) + rawcells <- sepBy1 tableCellRaw amp + guard $ length rawcells == cols + let rawcells' = zipWith3 (\c p s -> p ++ trim c ++ s) + rawcells prefixes suffixes + cells' <- mapM (parseFromString tableCell) rawcells' + let numcells = length cells' + guard $ numcells <= cols && numcells >= 1 + guard $ cells' /= [mempty] + -- note: a & b in a three-column table leaves an empty 3rd cell: + let cells'' = cells' ++ replicate (cols - numcells) mempty + spaces' + return cells'' + +spaces' :: PandocMonad m => LP m () +spaces' = spaces *> skipMany (comment *> spaces) + +simpTable :: PandocMonad m => Bool -> LP m Blocks +simpTable hasWidthParameter = try $ do + when hasWidthParameter $ () <$ (spaces' >> tok) + skipopts + (prefixes, aligns, suffixes) <- unzip3 <$> parseAligns + let cols = length aligns + optional $ controlSeq "caption" *> skipopts *> setCaption + optional lbreak + spaces' + skipMany hline + spaces' + header' <- option [] $ try (parseTableRow cols prefixes suffixes <* + lbreak <* many1 hline) + spaces' + rows <- sepEndBy (parseTableRow cols prefixes suffixes) + (lbreak <* optional (skipMany hline)) + spaces' + optional $ controlSeq "caption" *> skipopts *> setCaption + optional lbreak + spaces' + let header'' = if null header' + then replicate cols mempty + else header' + lookAhead $ controlSeq "end" -- make sure we're at end + return $ table mempty (zip aligns (repeat 0)) header'' rows + +removeDoubleQuotes :: String -> String +removeDoubleQuotes ('"':xs) = + case reverse xs of + '"':ys -> reverse ys + _ -> '"':xs +removeDoubleQuotes xs = xs diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..80a1cd7a2 --- /dev/null +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,2119 @@ +{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{-# LANGUAGE ScopedTypeVariables #-} + +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Markdown + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of markdown-formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Markdown ( readMarkdown ) where + +import Data.List ( transpose, sortBy, findIndex, intercalate ) +import qualified Data.Map as M +import Data.Scientific (coefficient, base10Exponent) +import Data.Ord ( comparing ) +import Data.Char ( isSpace, isAlphaNum, toLower, isPunctuation ) +import Data.Maybe +import Text.Pandoc.Definition +import Text.Pandoc.Emoji (emojis) +import Text.Pandoc.Generic (bottomUp) +import qualified Data.Text as T +import Data.Text (Text) +import qualified Data.Yaml as Yaml +import Data.Yaml (ParseException(..), YamlException(..), YamlMark(..)) +import qualified Data.HashMap.Strict as H +import qualified Text.Pandoc.Builder as B +import qualified Text.Pandoc.UTF8 as UTF8 +import qualified Data.Vector as V +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import Text.Pandoc.Options +import Text.Pandoc.Logging +import Text.Pandoc.Shared +import Text.Pandoc.Pretty (charWidth) +import Text.Pandoc.XML (fromEntities) +import Text.Pandoc.Parsing hiding (tableWith) +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) +import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, + isTextTag, isCommentTag ) +import Control.Monad +import System.FilePath (takeExtension, addExtension) +import Text.HTML.TagSoup +import Data.Monoid ((<>)) +import Control.Monad.Trans (lift) +import Control.Monad.Except (throwError, catchError) +import Text.Pandoc.Class (PandocMonad, report) + +type MarkdownParser m = ParserT [Char] ParserState m + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readMarkdown opts s = do + parsed <- (readWithM parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +-- +-- Constants and data structure definitions +-- + +isBulletListMarker :: Char -> Bool +isBulletListMarker '*' = True +isBulletListMarker '+' = True +isBulletListMarker '-' = True +isBulletListMarker _ = False + +isHruleChar :: Char -> Bool +isHruleChar '*' = True +isHruleChar '-' = True +isHruleChar '_' = True +isHruleChar _ = False + +setextHChars :: String +setextHChars = "=-" + +isBlank :: Char -> Bool +isBlank ' ' = True +isBlank '\t' = True +isBlank '\n' = True +isBlank _ = False + +-- +-- auxiliary functions +-- + +-- | Succeeds when we're in list context. +inList :: PandocMonad m => MarkdownParser m () +inList = do + ctx <- stateParserContext <$> getState + guard (ctx == ListItemState) + +spnl :: PandocMonad m => ParserT [Char] st m () +spnl = try $ do + skipSpaces + optional newline + skipSpaces + notFollowedBy (char '\n') + +indentSpaces :: PandocMonad m => MarkdownParser m String +indentSpaces = try $ do + tabStop <- getOption readerTabStop + count tabStop (char ' ') <|> + string "\t" <?> "indentation" + +nonindentSpaces :: PandocMonad m => MarkdownParser m String +nonindentSpaces = do + tabStop <- getOption readerTabStop + sps <- many (char ' ') + if length sps < tabStop + then return sps + else unexpected "indented line" + +-- returns number of spaces parsed +skipNonindentSpaces :: PandocMonad m => MarkdownParser m Int +skipNonindentSpaces = do + tabStop <- getOption readerTabStop + atMostSpaces (tabStop - 1) <* notFollowedBy (char ' ') + +atMostSpaces :: PandocMonad m => Int -> MarkdownParser m Int +atMostSpaces n + | n > 0 = (char ' ' >> (+1) <$> atMostSpaces (n-1)) <|> return 0 + | otherwise = return 0 + +litChar :: PandocMonad m => MarkdownParser m Char +litChar = escapedChar' + <|> characterReference + <|> noneOf "\n" + <|> try (newline >> notFollowedBy blankline >> return ' ') + +-- | Parse a sequence of inline elements between square brackets, +-- including inlines between balanced pairs of square brackets. +inlinesInBalancedBrackets :: PandocMonad m => MarkdownParser m (F Inlines) +inlinesInBalancedBrackets = do + char '[' + (_, raw) <- withRaw $ charsInBalancedBrackets 1 + guard $ not $ null raw + parseFromString (trimInlinesF . mconcat <$> many inline) (init raw) + +charsInBalancedBrackets :: PandocMonad m => Int -> MarkdownParser m () +charsInBalancedBrackets 0 = return () +charsInBalancedBrackets openBrackets = + (char '[' >> charsInBalancedBrackets (openBrackets + 1)) + <|> (char ']' >> charsInBalancedBrackets (openBrackets - 1)) + <|> (( (() <$ code) + <|> (() <$ (escapedChar')) + <|> (newline >> notFollowedBy blankline) + <|> skipMany1 (noneOf "[]`\n\\") + <|> (() <$ count 1 (oneOf "`\\")) + ) >> charsInBalancedBrackets openBrackets) + +-- +-- document structure +-- + +rawTitleBlockLine :: PandocMonad m => MarkdownParser m String +rawTitleBlockLine = do + char '%' + skipSpaces + first <- anyLine + rest <- many $ try $ do spaceChar + notFollowedBy blankline + skipSpaces + anyLine + return $ trim $ unlines (first:rest) + +titleLine :: PandocMonad m => MarkdownParser m (F Inlines) +titleLine = try $ do + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw + return $ trimInlinesF $ mconcat res + +authorsLine :: PandocMonad m => MarkdownParser m (F [Inlines]) +authorsLine = try $ do + raw <- rawTitleBlockLine + let sep = (char ';' <* spaces) <|> newline + let pAuthors = sepEndBy + (trimInlinesF . mconcat <$> many + (try $ notFollowedBy sep >> inline)) + sep + sequence <$> parseFromString pAuthors raw + +dateLine :: PandocMonad m => MarkdownParser m (F Inlines) +dateLine = try $ do + raw <- rawTitleBlockLine + res <- parseFromString (many inline) raw + return $ trimInlinesF $ mconcat res + +titleBlock :: PandocMonad m => MarkdownParser m () +titleBlock = pandocTitleBlock <|> mmdTitleBlock + +pandocTitleBlock :: PandocMonad m => MarkdownParser m () +pandocTitleBlock = try $ do + guardEnabled Ext_pandoc_title_block + lookAhead (char '%') + title <- option mempty titleLine + author <- option (return []) authorsLine + date <- option mempty dateLine + optional blanklines + let meta' = do title' <- title + author' <- author + date' <- date + return $ + (if B.isNull title' then id else B.setMeta "title" title') + . (if null author' then id else B.setMeta "author" author') + . (if B.isNull date' then id else B.setMeta "date" date') + $ nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + + +-- Adapted from solution at +-- http://stackoverflow.com/a/29448764/1901888 +foldrWithKeyM :: Monad m => (k -> b -> a -> m a) -> a -> H.HashMap k b -> m a +foldrWithKeyM f acc = H.foldrWithKey f' (return acc) + where + f' k b ma = ma >>= \a -> f k b a + +yamlMetaBlock :: PandocMonad m => MarkdownParser m (F Blocks) +yamlMetaBlock = try $ do + guardEnabled Ext_yaml_metadata_block + pos <- getPosition + string "---" + blankline + notFollowedBy blankline -- if --- is followed by a blank it's an HRULE + rawYamlLines <- manyTill anyLine stopLine + -- by including --- and ..., we allow yaml blocks with just comments: + let rawYaml = unlines ("---" : (rawYamlLines ++ ["..."])) + optional blanklines + opts <- stateOptions <$> getState + meta' <- case Yaml.decodeEither' $ UTF8.fromString rawYaml of + Right (Yaml.Object hashmap) -> + foldrWithKeyM + (\k v m -> do + if ignorable k + then return m + else (do v' <- lift $ yamlToMeta opts v + return $ B.setMeta (T.unpack k) v' m) + `catchError` + (\_ -> return m) + ) nullMeta hashmap + Right Yaml.Null -> return nullMeta + Right _ -> do + logMessage $ + CouldNotParseYamlMetadata "not an object" + pos + return nullMeta + Left err' -> do + case err' of + InvalidYaml (Just YamlParseException{ + yamlProblem = problem + , yamlContext = _ctxt + , yamlProblemMark = Yaml.YamlMark { + yamlLine = yline + , yamlColumn = ycol + }}) -> + logMessage $ CouldNotParseYamlMetadata + problem (setSourceLine + (setSourceColumn pos + (sourceColumn pos + ycol)) + (sourceLine pos + 1 + yline)) + _ -> logMessage $ CouldNotParseYamlMetadata + (show err') pos + return nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> (return meta') } + return mempty + +-- ignore fields ending with _ +ignorable :: Text -> Bool +ignorable t = (T.pack "_") `T.isSuffixOf` t + +toMetaValue :: PandocMonad m => ReaderOptions -> Text -> m MetaValue +toMetaValue opts x = toMeta <$> readMarkdown opts' (T.unpack x) + where + toMeta p = + case p of + Pandoc _ [Plain xs] -> MetaInlines xs + Pandoc _ [Para xs] + | endsWithNewline x -> MetaBlocks [Para xs] + | otherwise -> MetaInlines xs + Pandoc _ bs -> MetaBlocks bs + endsWithNewline t = T.pack "\n" `T.isSuffixOf` t + opts' = opts{readerExtensions = + disableExtension Ext_pandoc_title_block $ + disableExtension Ext_mmd_title_block $ + disableExtension Ext_yaml_metadata_block $ + readerExtensions opts } + +yamlToMeta :: PandocMonad m => ReaderOptions -> Yaml.Value -> m MetaValue +yamlToMeta opts (Yaml.String t) = toMetaValue opts t +yamlToMeta _ (Yaml.Number n) + -- avoid decimal points for numbers that don't need them: + | base10Exponent n >= 0 = return $ MetaString $ show + $ coefficient n * (10 ^ base10Exponent n) + | otherwise = return $ MetaString $ show n +yamlToMeta _ (Yaml.Bool b) = return $ MetaBool b +yamlToMeta opts (Yaml.Array xs) = B.toMetaValue <$> mapM (yamlToMeta opts) + (V.toList xs) +yamlToMeta opts (Yaml.Object o) = MetaMap <$> H.foldrWithKey (\k v m -> + if ignorable k + then m + else (do + v' <- yamlToMeta opts v + m' <- m + return (M.insert (T.unpack k) v' m'))) + (return M.empty) o +yamlToMeta _ _ = return $ MetaString "" + +stopLine :: PandocMonad m => MarkdownParser m () +stopLine = try $ (string "---" <|> string "...") >> blankline >> return () + +mmdTitleBlock :: PandocMonad m => MarkdownParser m () +mmdTitleBlock = try $ do + guardEnabled Ext_mmd_title_block + firstPair <- kvPair False + restPairs <- many (kvPair True) + let kvPairs = firstPair : restPairs + blanklines + updateState $ \st -> st{ stateMeta' = stateMeta' st <> + return (Meta $ M.fromList kvPairs) } + +kvPair :: PandocMonad m => Bool -> MarkdownParser m (String, MetaValue) +kvPair allowEmpty = try $ do + key <- many1Till (alphaNum <|> oneOf "_- ") (char ':') + val <- trim <$> manyTill anyChar + (try $ newline >> lookAhead (blankline <|> nonspaceChar)) + guard $ allowEmpty || not (null val) + let key' = concat $ words $ map toLower key + let val' = MetaBlocks $ B.toList $ B.plain $ B.text $ val + return (key',val') + +parseMarkdown :: PandocMonad m => MarkdownParser m Pandoc +parseMarkdown = do + optional titleBlock + blocks <- parseBlocks + st <- getState + let meta = runF (stateMeta' st) st + let Pandoc _ bs = B.doc $ runF blocks st + eastAsianLineBreaks <- option False $ + True <$ guardEnabled Ext_east_asian_line_breaks + reportLogMessages + return $ (if eastAsianLineBreaks + then bottomUp softBreakFilter + else id) $ Pandoc meta bs + +softBreakFilter :: [Inline] -> [Inline] +softBreakFilter (x:SoftBreak:y:zs) = + case (stringify x, stringify y) of + (xs@(_:_), (c:_)) + | charWidth (last xs) == 2 && charWidth c == 2 -> x:y:zs + _ -> x:SoftBreak:y:zs +softBreakFilter xs = xs + +referenceKey :: PandocMonad m => MarkdownParser m (F Blocks) +referenceKey = try $ do + pos <- getPosition + skipNonindentSpaces + (_,raw) <- reference + char ':' + skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') + let sourceURL = liftM unwords $ many $ try $ do + skipMany spaceChar + notFollowedBy' referenceTitle + notFollowedBy' $ guardEnabled Ext_link_attributes >> attributes + notFollowedBy' (() <$ reference) + many1 $ notFollowedBy space >> litChar + let betweenAngles = try $ char '<' >> manyTill litChar (char '>') + src <- try betweenAngles <|> sourceURL + tit <- option "" referenceTitle + attr <- option nullAttr $ try $ + guardEnabled Ext_link_attributes >> skipSpaces >> attributes + addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes + >> many (try $ spnl >> keyValAttr) + blanklines + let attr' = extractIdClass $ foldl (\x f -> f x) attr addKvs + target = (escapeURI $ trimr src, tit) + st <- getState + let oldkeys = stateKeys st + let key = toKey raw + case M.lookup key oldkeys of + Just _ -> logMessage $ DuplicateLinkReference raw pos + Nothing -> return () + updateState $ \s -> s { stateKeys = M.insert key (target, attr') oldkeys } + return $ return mempty + +referenceTitle :: PandocMonad m => MarkdownParser m String +referenceTitle = try $ do + skipSpaces >> optional newline >> skipSpaces + quotedTitle '"' <|> quotedTitle '\'' <|> charsInBalanced '(' ')' litChar + +-- A link title in quotes +quotedTitle :: PandocMonad m => Char -> MarkdownParser m String +quotedTitle c = try $ do + char c + notFollowedBy spaces + let pEnder = try $ char c >> notFollowedBy (satisfy isAlphaNum) + let regChunk = many1 (noneOf ['\\','\n','&',c]) <|> count 1 litChar + let nestedChunk = (\x -> [c] ++ x ++ [c]) <$> quotedTitle c + unwords . words . concat <$> manyTill (nestedChunk <|> regChunk) pEnder + +-- | PHP Markdown Extra style abbreviation key. Currently +-- we just skip them, since Pandoc doesn't have an element for +-- an abbreviation. +abbrevKey :: PandocMonad m => MarkdownParser m (F Blocks) +abbrevKey = do + guardEnabled Ext_abbreviations + try $ do + char '*' + reference + char ':' + skipMany (satisfy (/= '\n')) + blanklines + return $ return mempty + +noteMarker :: PandocMonad m => MarkdownParser m String +noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']') + +rawLine :: PandocMonad m => MarkdownParser m String +rawLine = try $ do + notFollowedBy blankline + notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker + optional indentSpaces + anyLine + +rawLines :: PandocMonad m => MarkdownParser m String +rawLines = do + first <- anyLine + rest <- many rawLine + return $ unlines (first:rest) + +noteBlock :: PandocMonad m => MarkdownParser m (F Blocks) +noteBlock = try $ do + pos <- getPosition + skipNonindentSpaces + ref <- noteMarker + char ':' + optional blankline + optional indentSpaces + first <- rawLines + rest <- many $ try $ blanklines >> indentSpaces >> rawLines + let raw = unlines (first:rest) ++ "\n" + optional blanklines + parsed <- parseFromString parseBlocks raw + let newnote = (ref, parsed) + oldnotes <- stateNotes' <$> getState + case lookup ref oldnotes of + Just _ -> logMessage $ DuplicateNoteReference ref pos + Nothing -> return () + updateState $ \s -> s { stateNotes' = newnote : oldnotes } + return mempty + +-- +-- parsing blocks +-- + +parseBlocks :: PandocMonad m => MarkdownParser m (F Blocks) +parseBlocks = mconcat <$> manyTill block eof + +block :: PandocMonad m => MarkdownParser m (F Blocks) +block = do + pos <- getPosition + res <- choice [ mempty <$ blanklines + , codeBlockFenced + , yamlMetaBlock + -- note: bulletList needs to be before header because of + -- the possibility of empty list items: - + , bulletList + , header + , lhsCodeBlock + , divHtml + , htmlBlock + , table + , codeBlockIndented + , guardEnabled Ext_latex_macros *> (macro >>= return . return) + , rawTeXBlock + , lineBlock + , blockQuote + , hrule + , orderedList + , definitionList + , noteBlock + , referenceKey + , abbrevKey + , para + , plain + ] <?> "block" + report $ ParsingTrace + (take 60 $ show $ B.toList $ runF res defaultParserState) pos + return res + +-- +-- header blocks +-- + +header :: PandocMonad m => MarkdownParser m (F Blocks) +header = setextHeader <|> atxHeader <?> "header" + +atxChar :: PandocMonad m => MarkdownParser m Char +atxChar = do + exts <- getOption readerExtensions + return $ if extensionEnabled Ext_literate_haskell exts + then '=' + else '#' + +atxHeader :: PandocMonad m => MarkdownParser m (F Blocks) +atxHeader = try $ do + level <- atxChar >>= many1 . char >>= return . length + notFollowedBy $ guardEnabled Ext_fancy_lists >> + (char '.' <|> char ')') -- this would be a list + skipSpaces + (text, raw) <- withRaw $ + trimInlinesF . mconcat <$> many (notFollowedBy atxClosing >> inline) + attr <- atxClosing + attr' <- registerHeader attr (runF text defaultParserState) + guardDisabled Ext_implicit_header_references + <|> registerImplicitHeader raw attr' + return $ B.headerWith attr' level <$> text + +atxClosing :: PandocMonad m => MarkdownParser m Attr +atxClosing = try $ do + attr' <- option nullAttr + (guardEnabled Ext_mmd_header_identifiers >> mmdHeaderIdentifier) + skipMany . char =<< atxChar + skipSpaces + attr <- option attr' + (guardEnabled Ext_header_attributes >> attributes) + blanklines + return attr + +setextHeaderEnd :: PandocMonad m => MarkdownParser m Attr +setextHeaderEnd = try $ do + attr <- option nullAttr + $ (guardEnabled Ext_mmd_header_identifiers >> mmdHeaderIdentifier) + <|> (guardEnabled Ext_header_attributes >> attributes) + blanklines + return attr + +mmdHeaderIdentifier :: PandocMonad m => MarkdownParser m Attr +mmdHeaderIdentifier = do + ident <- stripFirstAndLast . snd <$> reference + skipSpaces + return (ident,[],[]) + +setextHeader :: PandocMonad m => MarkdownParser m (F Blocks) +setextHeader = try $ do + -- This lookahead prevents us from wasting time parsing Inlines + -- unless necessary -- it gives a significant performance boost. + lookAhead $ anyLine >> many1 (oneOf setextHChars) >> blankline + skipSpaces + (text, raw) <- withRaw $ + trimInlinesF . mconcat <$> many1 (notFollowedBy setextHeaderEnd >> inline) + attr <- setextHeaderEnd + underlineChar <- oneOf setextHChars + many (char underlineChar) + blanklines + let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 + attr' <- registerHeader attr (runF text defaultParserState) + guardDisabled Ext_implicit_header_references + <|> registerImplicitHeader raw attr' + return $ B.headerWith attr' level <$> text + +registerImplicitHeader :: PandocMonad m => String -> Attr -> MarkdownParser m () +registerImplicitHeader raw attr@(ident, _, _) = do + let key = toKey $ "[" ++ raw ++ "]" + updateState (\s -> s { stateHeaderKeys = + M.insert key (('#':ident,""), attr) (stateHeaderKeys s) }) + +-- +-- hrule block +-- + +hrule :: PandocMonad m => ParserT [Char] st m (F Blocks) +hrule = try $ do + skipSpaces + start <- satisfy isHruleChar + count 2 (skipSpaces >> char start) + skipMany (spaceChar <|> char start) + newline + optional blanklines + return $ return B.horizontalRule + +-- +-- code blocks +-- + +indentedLine :: PandocMonad m => MarkdownParser m String +indentedLine = indentSpaces >> anyLine >>= return . (++ "\n") + +blockDelimiter :: PandocMonad m + => (Char -> Bool) + -> Maybe Int + -> ParserT [Char] st m Int +blockDelimiter f len = try $ do + c <- lookAhead (satisfy f) + case len of + Just l -> count l (char c) >> many (char c) >> return l + Nothing -> count 3 (char c) >> many (char c) >>= + return . (+ 3) . length + +attributes :: PandocMonad m => MarkdownParser m Attr +attributes = try $ do + char '{' + spnl + attrs <- many (attribute <* spnl) + char '}' + return $ foldl (\x f -> f x) nullAttr attrs + +attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr) +attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr + +identifier :: PandocMonad m => MarkdownParser m String +identifier = do + first <- letter + rest <- many $ alphaNum <|> oneOf "-_:." + return (first:rest) + +identifierAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) +identifierAttr = try $ do + char '#' + result <- identifier + return $ \(_,cs,kvs) -> (result,cs,kvs) + +classAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) +classAttr = try $ do + char '.' + result <- identifier + return $ \(id',cs,kvs) -> (id',cs ++ [result],kvs) + +keyValAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) +keyValAttr = try $ do + key <- identifier + char '=' + val <- enclosed (char '"') (char '"') litChar + <|> enclosed (char '\'') (char '\'') litChar + <|> many (escapedChar' <|> noneOf " \t\n\r}") + return $ \(id',cs,kvs) -> + case key of + "id" -> (val,cs,kvs) + "class" -> (id',cs ++ words val,kvs) + _ -> (id',cs,kvs ++ [(key,val)]) + +specialAttr :: PandocMonad m => MarkdownParser m (Attr -> Attr) +specialAttr = do + char '-' + return $ \(id',cs,kvs) -> (id',cs ++ ["unnumbered"],kvs) + +codeBlockFenced :: PandocMonad m => MarkdownParser m (F Blocks) +codeBlockFenced = try $ do + c <- try (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) + <|> (guardEnabled Ext_backtick_code_blocks >> lookAhead (char '`')) + size <- blockDelimiter (== c) Nothing + skipMany spaceChar + attr <- option ([],[],[]) $ + try (guardEnabled Ext_fenced_code_attributes >> attributes) + <|> ((\x -> ("",[toLanguageId x],[])) <$> many1 nonspaceChar) + blankline + contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) + blanklines + return $ return $ B.codeBlockWith attr $ intercalate "\n" contents + +-- correctly handle github language identifiers +toLanguageId :: String -> String +toLanguageId = map toLower . go + where go "c++" = "cpp" + go "objective-c" = "objectivec" + go x = x + +codeBlockIndented :: PandocMonad m => MarkdownParser m (F Blocks) +codeBlockIndented = do + contents <- many1 (indentedLine <|> + try (do b <- blanklines + l <- indentedLine + return $ b ++ l)) + optional blanklines + classes <- getOption readerIndentedCodeClasses + return $ return $ B.codeBlockWith ("", classes, []) $ + stripTrailingNewlines $ concat contents + +lhsCodeBlock :: PandocMonad m => MarkdownParser m (F Blocks) +lhsCodeBlock = do + guardEnabled Ext_literate_haskell + (return . B.codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> + (lhsCodeBlockBird <|> lhsCodeBlockLaTeX)) + <|> (return . B.codeBlockWith ("",["sourceCode","haskell"],[]) <$> + lhsCodeBlockInverseBird) + +lhsCodeBlockLaTeX :: PandocMonad m => MarkdownParser m String +lhsCodeBlockLaTeX = try $ do + string "\\begin{code}" + manyTill spaceChar newline + contents <- many1Till anyChar (try $ string "\\end{code}") + blanklines + return $ stripTrailingNewlines contents + +lhsCodeBlockBird :: PandocMonad m => MarkdownParser m String +lhsCodeBlockBird = lhsCodeBlockBirdWith '>' + +lhsCodeBlockInverseBird :: PandocMonad m => MarkdownParser m String +lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<' + +lhsCodeBlockBirdWith :: PandocMonad m => Char -> MarkdownParser m String +lhsCodeBlockBirdWith c = try $ do + pos <- getPosition + when (sourceColumn pos /= 1) $ fail "Not in first column" + lns <- many1 $ birdTrackLine c + -- if (as is normal) there is always a space after >, drop it + let lns' = if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns + blanklines + return $ intercalate "\n" lns' + +birdTrackLine :: PandocMonad m => Char -> ParserT [Char] st m String +birdTrackLine c = try $ do + char c + -- allow html tags on left margin: + when (c == '<') $ notFollowedBy letter + anyLine + +-- +-- block quotes +-- + +emailBlockQuoteStart :: PandocMonad m => MarkdownParser m Char +emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' <* optional (char ' ') + +emailBlockQuote :: PandocMonad m => MarkdownParser m [String] +emailBlockQuote = try $ do + emailBlockQuoteStart + let emailLine = many $ nonEndline <|> try + (endline >> notFollowedBy emailBlockQuoteStart >> + return '\n') + let emailSep = try (newline >> emailBlockQuoteStart) + first <- emailLine + rest <- many $ try $ emailSep >> emailLine + let raw = first:rest + newline <|> (eof >> return '\n') + optional blanklines + return raw + +blockQuote :: PandocMonad m => MarkdownParser m (F Blocks) +blockQuote = do + raw <- emailBlockQuote + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + return $ B.blockQuote <$> contents + +-- +-- list blocks +-- + +bulletListStart :: PandocMonad m => MarkdownParser m () +bulletListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + startpos <- sourceColumn <$> getPosition + skipNonindentSpaces + notFollowedBy' (() <$ hrule) -- because hrules start out just like lists + satisfy isBulletListMarker + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + () <$ atMostSpaces (tabStop - (endpos - startpos)) + +anyOrderedListStart :: PandocMonad m => MarkdownParser m (Int, ListNumberStyle, ListNumberDelim) +anyOrderedListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + startpos <- sourceColumn <$> getPosition + skipNonindentSpaces + notFollowedBy $ string "p." >> spaceChar >> digit -- page number + res <- do guardDisabled Ext_fancy_lists + start <- many1 digit >>= safeRead + char '.' + return (start, DefaultStyle, DefaultDelim) + <|> do (num, style, delim) <- anyOrderedListMarker + -- if it could be an abbreviated first name, + -- insist on more than one space + when (delim == Period && (style == UpperAlpha || + (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000]))) $ + () <$ spaceChar + return (num, style, delim) + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + atMostSpaces (tabStop - (endpos - startpos)) + return res + +listStart :: PandocMonad m => MarkdownParser m () +listStart = bulletListStart <|> (anyOrderedListStart >> return ()) + +listLine :: PandocMonad m => MarkdownParser m String +listLine = try $ do + notFollowedBy' (do indentSpaces + many spaceChar + listStart) + notFollowedByHtmlCloser + optional (() <$ indentSpaces) + listLineCommon + +listLineCommon :: PandocMonad m => MarkdownParser m String +listLineCommon = concat <$> manyTill + ( many1 (satisfy $ \c -> c /= '\n' && c /= '<') + <|> liftM snd (htmlTag isCommentTag) + <|> count 1 anyChar + ) newline + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: PandocMonad m + => MarkdownParser m a + -> MarkdownParser m String +rawListItem start = try $ do + start + first <- listLineCommon + rest <- many (notFollowedBy listStart >> notFollowedBy blankline >> listLine) + blanks <- many blankline + return $ unlines (first:rest) ++ blanks + +-- continuation of a list item - indented and separated by blankline +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation :: PandocMonad m => MarkdownParser m String +listContinuation = try $ do + lookAhead indentSpaces + result <- many1 listContinuationLine + blanks <- many blankline + return $ concat result ++ blanks + +notFollowedByHtmlCloser :: PandocMonad m => MarkdownParser m () +notFollowedByHtmlCloser = do + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just t -> notFollowedBy' $ htmlTag (~== TagClose t) + Nothing -> return () + +listContinuationLine :: PandocMonad m => MarkdownParser m String +listContinuationLine = try $ do + notFollowedBy blankline + notFollowedBy' listStart + notFollowedByHtmlCloser + optional indentSpaces + result <- anyLine + return $ result ++ "\n" + +listItem :: PandocMonad m + => MarkdownParser m a + -> MarkdownParser m (F Blocks) +listItem start = try $ do + first <- rawListItem start + continuations <- many listContinuation + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may contain various block elements: + let raw = concat (first:continuations) + contents <- parseFromString parseBlocks raw + updateState (\st -> st {stateParserContext = oldContext}) + return contents + +orderedList :: PandocMonad m => MarkdownParser m (F Blocks) +orderedList = try $ do + (start, style, delim) <- lookAhead anyOrderedListStart + unless (style `elem` [DefaultStyle, Decimal, Example] && + delim `elem` [DefaultDelim, Period]) $ + guardEnabled Ext_fancy_lists + when (style == Example) $ guardEnabled Ext_example_lists + items <- fmap sequence $ many1 $ listItem + ( try $ do + optional newline -- if preceded by Plain block in a list + startpos <- sourceColumn <$> getPosition + skipNonindentSpaces + res <- orderedListMarker style delim + endpos <- sourceColumn <$> getPosition + tabStop <- getOption readerTabStop + lookAhead (newline <|> spaceChar) + atMostSpaces (tabStop - (endpos - startpos)) + return res ) + start' <- option 1 $ guardEnabled Ext_startnum >> return start + return $ B.orderedListWith (start', style, delim) <$> fmap compactify items + +bulletList :: PandocMonad m => MarkdownParser m (F Blocks) +bulletList = do + items <- fmap sequence $ many1 $ listItem bulletListStart + return $ B.bulletList <$> fmap compactify items + +-- definition lists + +defListMarker :: PandocMonad m => MarkdownParser m () +defListMarker = do + sps <- nonindentSpaces + char ':' <|> char '~' + tabStop <- getOption readerTabStop + let remaining = tabStop - (length sps + 1) + if remaining > 0 + then try (count remaining (char ' ')) <|> string "\t" <|> many1 spaceChar + else mzero + return () + +definitionListItem :: PandocMonad m => Bool -> MarkdownParser m (F (Inlines, [Blocks])) +definitionListItem compact = try $ do + rawLine' <- anyLine + raw <- many1 $ defRawBlock compact + term <- parseFromString (trimInlinesF . mconcat <$> many inline) rawLine' + contents <- mapM (parseFromString parseBlocks . (++"\n")) raw + optional blanklines + return $ liftM2 (,) term (sequence contents) + +defRawBlock :: PandocMonad m => Bool -> MarkdownParser m String +defRawBlock compact = try $ do + hasBlank <- option False $ blankline >> return True + defListMarker + firstline <- anyLine + let dline = try + ( do notFollowedBy blankline + notFollowedByHtmlCloser + if compact -- laziness not compatible with compact + then () <$ indentSpaces + else (() <$ indentSpaces) + <|> notFollowedBy defListMarker + anyLine ) + rawlines <- many dline + cont <- liftM concat $ many $ try $ do + trailing <- option "" blanklines + ln <- indentSpaces >> notFollowedBy blankline >> anyLine + lns <- many dline + return $ trailing ++ unlines (ln:lns) + return $ trimr (firstline ++ "\n" ++ unlines rawlines ++ cont) ++ + if hasBlank || not (null cont) then "\n\n" else "" + +definitionList :: PandocMonad m => MarkdownParser m (F Blocks) +definitionList = try $ do + lookAhead (anyLine >> + optional (blankline >> notFollowedBy (table >> return ())) >> + -- don't capture table caption as def list! + defListMarker) + compactDefinitionList <|> normalDefinitionList + +compactDefinitionList :: PandocMonad m => MarkdownParser m (F Blocks) +compactDefinitionList = do + guardEnabled Ext_compact_definition_lists + items <- fmap sequence $ many1 $ definitionListItem True + return $ B.definitionList <$> fmap compactifyDL items + +normalDefinitionList :: PandocMonad m => MarkdownParser m (F Blocks) +normalDefinitionList = do + guardEnabled Ext_definition_lists + items <- fmap sequence $ many1 $ definitionListItem False + return $ B.definitionList <$> items + +-- +-- paragraph block +-- + +para :: PandocMonad m => MarkdownParser m (F Blocks) +para = try $ do + exts <- getOption readerExtensions + result <- trimInlinesF . mconcat <$> many1 inline + option (B.plain <$> result) + $ try $ do + newline + (blanklines >> return mempty) + <|> (guardDisabled Ext_blank_before_blockquote >> () <$ lookAhead blockQuote) + <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) + <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) + <|> (guardEnabled Ext_lists_without_preceding_blankline >> + -- Avoid creating a paragraph in a nested list. + notFollowedBy' inList >> + () <$ lookAhead listStart) + <|> do guardEnabled Ext_native_divs + inHtmlBlock <- stateInHtmlBlock <$> getState + case inHtmlBlock of + Just "div" -> () <$ + lookAhead (htmlTag (~== TagClose "div")) + _ -> mzero + return $ do + result' <- result + case B.toList result' of + [Image attr alt (src,tit)] + | Ext_implicit_figures `extensionEnabled` exts -> + -- the fig: at beginning of title indicates a figure + return $ B.para $ B.singleton + $ Image attr alt (src,'f':'i':'g':':':tit) + _ -> return $ B.para result' + +plain :: PandocMonad m => MarkdownParser m (F Blocks) +plain = fmap B.plain . trimInlinesF . mconcat <$> many1 inline + +-- +-- raw html +-- + +htmlElement :: PandocMonad m => MarkdownParser m String +htmlElement = rawVerbatimBlock + <|> strictHtmlBlock + <|> liftM snd (htmlTag isBlockTag) + +htmlBlock :: PandocMonad m => MarkdownParser m (F Blocks) +htmlBlock = do + guardEnabled Ext_raw_html + try (do + (TagOpen t attrs) <- lookAhead $ fst <$> htmlTag isBlockTag + (guard (t `elem` ["pre","style","script"]) >> + (return . B.rawBlock "html") <$> rawVerbatimBlock) + <|> (do guardEnabled Ext_markdown_attribute + oldMarkdownAttribute <- stateMarkdownAttribute <$> getState + markdownAttribute <- + case lookup "markdown" attrs of + Just "0" -> False <$ updateState (\st -> st{ + stateMarkdownAttribute = False }) + Just _ -> True <$ updateState (\st -> st{ + stateMarkdownAttribute = True }) + Nothing -> return oldMarkdownAttribute + res <- if markdownAttribute + then rawHtmlBlocks + else htmlBlock' + updateState $ \st -> st{ stateMarkdownAttribute = + oldMarkdownAttribute } + return res) + <|> (guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks)) + <|> htmlBlock' + +htmlBlock' :: PandocMonad m => MarkdownParser m (F Blocks) +htmlBlock' = try $ do + first <- htmlElement + skipMany spaceChar + optional blanklines + return $ return $ B.rawBlock "html" first + +strictHtmlBlock :: PandocMonad m => MarkdownParser m String +strictHtmlBlock = htmlInBalanced (not . isInlineTag) + +rawVerbatimBlock :: PandocMonad m => MarkdownParser m String +rawVerbatimBlock = htmlInBalanced isVerbTag + where isVerbTag (TagOpen "pre" _) = True + isVerbTag (TagOpen "style" _) = True + isVerbTag (TagOpen "script" _) = True + isVerbTag _ = False + +rawTeXBlock :: PandocMonad m => MarkdownParser m (F Blocks) +rawTeXBlock = do + guardEnabled Ext_raw_tex + result <- (B.rawBlock "latex" . concat <$> + rawLaTeXBlock `sepEndBy1` blankline) + <|> (B.rawBlock "context" . concat <$> + rawConTeXtEnvironment `sepEndBy1` blankline) + spaces + return $ return result + +rawHtmlBlocks :: PandocMonad m => MarkdownParser m (F Blocks) +rawHtmlBlocks = do + (TagOpen tagtype _, raw) <- htmlTag isBlockTag + -- try to find closing tag + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just tagtype } + let closer = htmlTag (\x -> x ~== TagClose tagtype) + contents <- mconcat <$> many (notFollowedBy' closer >> block) + result <- + (closer >>= \(_, rawcloser) -> return ( + return (B.rawBlock "html" $ stripMarkdownAttribute raw) <> + contents <> + return (B.rawBlock "html" rawcloser))) + <|> return (return (B.rawBlock "html" raw) <> contents) + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + return result + +-- remove markdown="1" attribute +stripMarkdownAttribute :: String -> String +stripMarkdownAttribute s = renderTags' $ map filterAttrib $ parseTags s + where filterAttrib (TagOpen t as) = TagOpen t + [(k,v) | (k,v) <- as, k /= "markdown"] + filterAttrib x = x + +-- +-- line block +-- + +lineBlock :: PandocMonad m => MarkdownParser m (F Blocks) +lineBlock = try $ do + guardEnabled Ext_line_blocks + lines' <- lineBlockLines >>= + mapM (parseFromString (trimInlinesF . mconcat <$> many inline)) + return $ B.lineBlock <$> sequence lines' + +-- +-- Tables +-- + +-- Parse a dashed line with optional trailing spaces; return its length +-- and the length including trailing space. +dashedLine :: PandocMonad m + => Char + -> ParserT [Char] st m (Int, Int) +dashedLine ch = do + dashes <- many1 (char ch) + sp <- many spaceChar + let lengthDashes = length dashes + lengthSp = length sp + return (lengthDashes, lengthDashes + lengthSp) + +-- Parse a table header with dashed lines of '-' preceded by +-- one (or zero) line of text. +simpleTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m (F [Blocks], [Alignment], [Int]) +simpleTableHeader headless = try $ do + rawContent <- if headless + then return "" + else anyLine + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + -- If no header, calculate alignment on basis of first row of text + rawHeads <- liftM (tail . splitStringByIndices (init indices)) $ + if headless + then lookAhead anyLine + else return rawContent + let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths + let rawHeads' = if headless + then replicate (length dashes) "" + else rawHeads + heads <- fmap sequence + $ mapM (parseFromString (mconcat <$> many plain)) + $ map trim rawHeads' + return (heads, aligns, indices) + +-- Returns an alignment type for a table, based on a list of strings +-- (the rows of the column header) and a number (the length of the +-- dashed line under the rows. +alignType :: [String] + -> Int + -> Alignment +alignType [] _ = AlignDefault +alignType strLst len = + let nonempties = filter (not . null) $ map trimr strLst + (leftSpace, rightSpace) = + case sortBy (comparing length) nonempties of + (x:_) -> (head x `elem` " \t", length x < len) + [] -> (False, False) + in case (leftSpace, rightSpace) of + (True, False) -> AlignRight + (False, True) -> AlignLeft + (True, True) -> AlignCenter + (False, False) -> AlignDefault + +-- Parse a table footer - dashed lines followed by blank line. +tableFooter :: PandocMonad m => MarkdownParser m String +tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines + +-- Parse a table separator - dashed line. +tableSep :: PandocMonad m => MarkdownParser m Char +tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n' + +-- Parse a raw line and split it into chunks by indices. +rawTableLine :: PandocMonad m + => [Int] + -> MarkdownParser m [String] +rawTableLine indices = do + notFollowedBy' (blanklines <|> tableFooter) + line <- many1Till anyChar newline + return $ map trim $ tail $ + splitStringByIndices (init indices) line + +-- Parse a table line and return a list of lists of blocks (columns). +tableLine :: PandocMonad m + => [Int] + -> MarkdownParser m (F [Blocks]) +tableLine indices = rawTableLine indices >>= + fmap sequence . mapM (parseFromString (mconcat <$> many plain)) + +-- Parse a multiline table row and return a list of blocks (columns). +multilineRow :: PandocMonad m + => [Int] + -> MarkdownParser m (F [Blocks]) +multilineRow indices = do + colLines <- many1 (rawTableLine indices) + let cols = map unlines $ transpose colLines + fmap sequence $ mapM (parseFromString (mconcat <$> many plain)) cols + +-- Parses a table caption: inlines beginning with 'Table:' +-- and followed by blank lines. +tableCaption :: PandocMonad m => MarkdownParser m (F Inlines) +tableCaption = try $ do + guardEnabled Ext_table_captions + skipNonindentSpaces + string ":" <|> string "Table:" + trimInlinesF . mconcat <$> many1 inline <* blanklines + +-- Parse a simple table with '---' header and one line per row. +simpleTable :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +simpleTable headless = do + (aligns, _widths, heads', lines') <- + tableWith (simpleTableHeader headless) tableLine + (return ()) + (if headless then tableFooter else tableFooter <|> blanklines) + -- Simple tables get 0s for relative column widths (i.e., use default) + return (aligns, replicate (length aligns) 0, heads', lines') + +-- Parse a multiline table: starts with row of '-' on top, then header +-- (which may be multiline), then the rows, +-- which may be multiline, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +multilineTable :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +multilineTable headless = + tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter + +multilineTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> MarkdownParser m (F [Blocks], [Alignment], [Int]) +multilineTableHeader headless = try $ do + unless headless $ + tableSep >> notFollowedBy blankline + rawContent <- if headless + then return $ repeat "" + else many1 $ notFollowedBy tableSep >> anyLine + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + rawHeadsList <- if headless + then liftM (map (:[]) . tail . + splitStringByIndices (init indices)) $ lookAhead anyLine + else return $ transpose $ map + (tail . splitStringByIndices (init indices)) + rawContent + let aligns = zipWith alignType rawHeadsList lengths + let rawHeads = if headless + then replicate (length dashes) "" + else map (unlines . map trim) rawHeadsList + heads <- fmap sequence $ + mapM (parseFromString (mconcat <$> many plain)) $ + map trim rawHeads + return (heads, aligns, indices) + +-- Parse a grid table: starts with row of '-' on top, then header +-- (which may be grid), then the rows, +-- which may be grid, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +gridTable :: PandocMonad m => Bool -- ^ Headerless table + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +gridTable headless = + tableWith (gridTableHeader headless) gridTableRow + (gridTableSep '-') gridTableFooter + +gridTableSplitLine :: [Int] -> String -> [String] +gridTableSplitLine indices line = map removeFinalBar $ tail $ + splitStringByIndices (init indices) $ trimr line + +gridPart :: PandocMonad m => Char -> ParserT [Char] st m ((Int, Int), Alignment) +gridPart ch = do + leftColon <- option False (True <$ char ':') + dashes <- many1 (char ch) + rightColon <- option False (True <$ char ':') + char '+' + let lengthDashes = length dashes + (if leftColon then 1 else 0) + + (if rightColon then 1 else 0) + let alignment = case (leftColon, rightColon) of + (True, True) -> AlignCenter + (True, False) -> AlignLeft + (False, True) -> AlignRight + (False, False) -> AlignDefault + return ((lengthDashes, lengthDashes + 1), alignment) + +gridDashedLines :: PandocMonad m => Char -> ParserT [Char] st m [((Int, Int), Alignment)] +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline + +removeFinalBar :: String -> String +removeFinalBar = + reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse + +-- | Separator between rows of grid table. +gridTableSep :: PandocMonad m => Char -> MarkdownParser m Char +gridTableSep ch = try $ gridDashedLines ch >> return '\n' + +-- | Parse header for a grid table. +gridTableHeader :: PandocMonad m => Bool -- ^ Headerless table + -> MarkdownParser m (F [Blocks], [Alignment], [Int]) +gridTableHeader headless = try $ do + optional blanklines + dashes <- gridDashedLines '-' + rawContent <- if headless + then return [] + else many1 (try (char '|' >> anyLine)) + underDashes <- if headless + then return dashes + else gridDashedLines '=' + guard $ length dashes == length underDashes + let lines' = map (snd . fst) underDashes + let indices = scanl (+) 0 lines' + let aligns = map snd underDashes + let rawHeads = if headless + then replicate (length underDashes) "" + else map (unlines . map trim) $ transpose + $ map (gridTableSplitLine indices) rawContent + heads <- fmap sequence $ mapM (parseFromString parseBlocks . trim) rawHeads + return (heads, aligns, indices) + +gridTableRawLine :: PandocMonad m => [Int] -> MarkdownParser m [String] +gridTableRawLine indices = do + char '|' + line <- anyLine + return (gridTableSplitLine indices line) + +-- | Parse row of grid table. +gridTableRow :: PandocMonad m => [Int] + -> MarkdownParser m (F [Blocks]) +gridTableRow indices = do + colLines <- many1 (gridTableRawLine indices) + let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ + transpose colLines + fmap compactify <$> fmap sequence (mapM (parseFromString parseBlocks) cols) + +removeOneLeadingSpace :: [String] -> [String] +removeOneLeadingSpace xs = + if all startsWithSpace xs + then map (drop 1) xs + else xs + where startsWithSpace "" = True + startsWithSpace (y:_) = y == ' ' + +-- | Parse footer for a grid table. +gridTableFooter :: PandocMonad m => MarkdownParser m [Char] +gridTableFooter = blanklines + +pipeBreak :: PandocMonad m => MarkdownParser m ([Alignment], [Int]) +pipeBreak = try $ do + nonindentSpaces + openPipe <- (True <$ char '|') <|> return False + first <- pipeTableHeaderPart + rest <- many $ sepPipe *> pipeTableHeaderPart + -- surrounding pipes needed for a one-column table: + guard $ not (null rest && not openPipe) + optional (char '|') + blankline + return $ unzip (first:rest) + +pipeTable :: PandocMonad m => MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +pipeTable = try $ do + nonindentSpaces + lookAhead nonspaceChar + (heads,(aligns, seplengths)) <- (,) <$> pipeTableRow <*> pipeBreak + let heads' = take (length aligns) <$> heads + lines' <- many pipeTableRow + let lines'' = map (take (length aligns) <$>) lines' + let maxlength = maximum $ + map (\x -> length . stringify $ runF x def) (heads' : lines'') + numColumns <- getOption readerColumns + let widths = if maxlength > numColumns + then map (\len -> + fromIntegral (len + 1) / fromIntegral numColumns) + seplengths + else replicate (length aligns) 0.0 + return $ (aligns, widths, heads', sequence lines'') + +sepPipe :: PandocMonad m => MarkdownParser m () +sepPipe = try $ do + char '|' <|> char '+' + notFollowedBy blankline + +-- parse a row, also returning probable alignments for org-table cells +pipeTableRow :: PandocMonad m => MarkdownParser m (F [Blocks]) +pipeTableRow = try $ do + scanForPipe + skipMany spaceChar + openPipe <- (True <$ char '|') <|> return False + -- split into cells + let chunk = void (code <|> rawHtmlInline <|> escapedChar <|> rawLaTeXInline') + <|> void (noneOf "|\n\r") + let cellContents = ((trim . snd) <$> withRaw (many chunk)) >>= + parseFromString pipeTableCell + cells <- cellContents `sepEndBy1` (char '|') + -- surrounding pipes needed for a one-column table: + guard $ not (length cells == 1 && not openPipe) + blankline + return $ sequence cells + +pipeTableCell :: PandocMonad m => MarkdownParser m (F Blocks) +pipeTableCell = do + result <- many inline + if null result + then return mempty + else return $ B.plain . mconcat <$> sequence result + +pipeTableHeaderPart :: PandocMonad m => ParserT [Char] st m (Alignment, Int) +pipeTableHeaderPart = try $ do + skipMany spaceChar + left <- optionMaybe (char ':') + pipe <- many1 (char '-') + right <- optionMaybe (char ':') + skipMany spaceChar + let len = length pipe + maybe 0 (const 1) left + maybe 0 (const 1) right + return $ + ((case (left,right) of + (Nothing,Nothing) -> AlignDefault + (Just _,Nothing) -> AlignLeft + (Nothing,Just _) -> AlignRight + (Just _,Just _) -> AlignCenter), len) + +-- Succeed only if current line contains a pipe. +scanForPipe :: PandocMonad m => ParserT [Char] st m () +scanForPipe = do + inp <- getInput + case break (\c -> c == '\n' || c == '|') inp of + (_,'|':_) -> return () + _ -> mzero + +-- | Parse a table using 'headerParser', 'rowParser', +-- 'lineParser', and 'footerParser'. Variant of the version in +-- Text.Pandoc.Parsing. +tableWith :: PandocMonad m + => MarkdownParser m (F [Blocks], [Alignment], [Int]) + -> ([Int] -> MarkdownParser m (F [Blocks])) + -> MarkdownParser m sep + -> MarkdownParser m end + -> MarkdownParser m ([Alignment], [Double], F [Blocks], F [[Blocks]]) +tableWith headerParser rowParser lineParser footerParser = try $ do + (heads, aligns, indices) <- headerParser + lines' <- fmap sequence $ rowParser indices `sepEndBy1` lineParser + footerParser + numColumns <- getOption readerColumns + let widths = if (indices == []) + then replicate (length aligns) 0.0 + else widthsFromIndices numColumns indices + return $ (aligns, widths, heads, lines') + +table :: PandocMonad m => MarkdownParser m (F Blocks) +table = try $ do + frontCaption <- option Nothing (Just <$> tableCaption) + (aligns, widths, heads, lns) <- + try (guardEnabled Ext_pipe_tables >> scanForPipe >> pipeTable) <|> + try (guardEnabled Ext_multiline_tables >> + multilineTable False) <|> + try (guardEnabled Ext_simple_tables >> + (simpleTable True <|> simpleTable False)) <|> + try (guardEnabled Ext_multiline_tables >> + multilineTable True) <|> + try (guardEnabled Ext_grid_tables >> + (gridTable False <|> gridTable True)) <?> "table" + optional blanklines + caption <- case frontCaption of + Nothing -> option (return mempty) tableCaption + Just c -> return c + -- renormalize widths if greater than 100%: + let totalWidth = sum widths + let widths' = if totalWidth < 1 + then widths + else map (/ totalWidth) widths + return $ do + caption' <- caption + heads' <- heads + lns' <- lns + return $ B.table caption' (zip aligns widths') heads' lns' + +-- +-- inline +-- + +inline :: PandocMonad m => MarkdownParser m (F Inlines) +inline = choice [ whitespace + , bareURL + , str + , endline + , code + , strongOrEmph + , note + , cite + , bracketedSpan + , link + , image + , math + , strikeout + , subscript + , superscript + , inlineNote -- after superscript because of ^[link](/foo)^ + , autoLink + , spanHtml + , rawHtmlInline + , escapedChar + , rawLaTeXInline' + , exampleRef + , smart + , return . B.singleton <$> charRef + , emoji + , symbol + , ltSign + ] <?> "inline" + +escapedChar' :: PandocMonad m => MarkdownParser m Char +escapedChar' = try $ do + char '\\' + (guardEnabled Ext_all_symbols_escapable >> satisfy (not . isAlphaNum)) + <|> (guardEnabled Ext_angle_brackets_escapable >> + oneOf "\\`*_{}[]()>#+-.!~\"<>") + <|> (guardEnabled Ext_escaped_line_breaks >> char '\n') + <|> oneOf "\\`*_{}[]()>#+-.!~\"" + +escapedChar :: PandocMonad m => MarkdownParser m (F Inlines) +escapedChar = do + result <- escapedChar' + case result of + ' ' -> return $ return $ B.str "\160" -- "\ " is a nonbreaking space + '\n' -> guardEnabled Ext_escaped_line_breaks >> + return (return B.linebreak) -- "\[newline]" is a linebreak + _ -> return $ return $ B.str [result] + +ltSign :: PandocMonad m => MarkdownParser m (F Inlines) +ltSign = do + guardDisabled Ext_raw_html + <|> (notFollowedByHtmlCloser >> notFollowedBy' (htmlTag isBlockTag)) + char '<' + return $ return $ B.str "<" + +exampleRef :: PandocMonad m => MarkdownParser m (F Inlines) +exampleRef = try $ do + guardEnabled Ext_example_lists + char '@' + lab <- many1 (alphaNum <|> oneOf "-_") + return $ do + st <- askF + return $ case M.lookup lab (stateExamples st) of + Just n -> B.str (show n) + Nothing -> B.str ('@':lab) + +symbol :: PandocMonad m => MarkdownParser m (F Inlines) +symbol = do + result <- noneOf "<\\\n\t " + <|> try (do lookAhead $ char '\\' + notFollowedBy' (() <$ rawTeXBlock) + char '\\') + return $ return $ B.str [result] + +-- parses inline code, between n `s and n `s +code :: PandocMonad m => MarkdownParser m (F Inlines) +code = try $ do + starts <- many1 (char '`') + skipSpaces + result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> + (char '\n' >> notFollowedBy' blankline >> return " ")) + (try (skipSpaces >> count (length starts) (char '`') >> + notFollowedBy (char '`'))) + attr <- option ([],[],[]) (try $ guardEnabled Ext_inline_code_attributes + >> attributes) + return $ return $ B.codeWith attr $ trim $ concat result + +math :: PandocMonad m => MarkdownParser m (F Inlines) +math = (return . B.displayMath <$> (mathDisplay >>= applyMacros')) + <|> (return . B.math <$> (mathInline >>= applyMacros')) <+?> + (guardEnabled Ext_smart *> (return <$> apostrophe) + <* notFollowedBy (space <|> satisfy isPunctuation)) + +-- Parses material enclosed in *s, **s, _s, or __s. +-- Designed to avoid backtracking. +enclosure :: PandocMonad m + => Char + -> MarkdownParser m (F Inlines) +enclosure c = do + -- we can't start an enclosure with _ if after a string and + -- the intraword_underscores extension is enabled: + guardDisabled Ext_intraword_underscores + <|> guard (c == '*') + <|> (guard =<< notAfterString) + cs <- many1 (char c) + (return (B.str cs) <>) <$> whitespace + <|> do + case length cs of + 3 -> three c + 2 -> two c mempty + 1 -> one c mempty + _ -> return (return $ B.str cs) + +ender :: PandocMonad m => Char -> Int -> MarkdownParser m () +ender c n = try $ do + count n (char c) + guard (c == '*') + <|> guardDisabled Ext_intraword_underscores + <|> notFollowedBy alphaNum + +-- Parse inlines til you hit one c or a sequence of two cs. +-- If one c, emit emph and then parse two. +-- If two cs, emit strong and then parse one. +-- Otherwise, emit ccc then the results. +three :: PandocMonad m => Char -> MarkdownParser m (F Inlines) +three c = do + contents <- mconcat <$> many (notFollowedBy (ender c 1) >> inline) + (ender c 3 >> return ((B.strong . B.emph) <$> contents)) + <|> (ender c 2 >> one c (B.strong <$> contents)) + <|> (ender c 1 >> two c (B.emph <$> contents)) + <|> return (return (B.str [c,c,c]) <> contents) + +-- Parse inlines til you hit two c's, and emit strong. +-- If you never do hit two cs, emit ** plus inlines parsed. +two :: PandocMonad m => Char -> F Inlines -> MarkdownParser m (F Inlines) +two c prefix' = do + contents <- mconcat <$> many (try $ notFollowedBy (ender c 2) >> inline) + (ender c 2 >> return (B.strong <$> (prefix' <> contents))) + <|> return (return (B.str [c,c]) <> (prefix' <> contents)) + +-- Parse inlines til you hit a c, and emit emph. +-- If you never hit a c, emit * plus inlines parsed. +one :: PandocMonad m => Char -> F Inlines -> MarkdownParser m (F Inlines) +one c prefix' = do + contents <- mconcat <$> many ( (notFollowedBy (ender c 1) >> inline) + <|> try (string [c,c] >> + notFollowedBy (ender c 1) >> + two c mempty) ) + (ender c 1 >> return (B.emph <$> (prefix' <> contents))) + <|> return (return (B.str [c]) <> (prefix' <> contents)) + +strongOrEmph :: PandocMonad m => MarkdownParser m (F Inlines) +strongOrEmph = enclosure '*' <|> enclosure '_' + +-- | Parses a list of inlines between start and end delimiters. +inlinesBetween :: PandocMonad m + => (Show b) + => MarkdownParser m a + -> MarkdownParser m b + -> MarkdownParser m (F Inlines) +inlinesBetween start end = + (trimInlinesF . mconcat) <$> try (start >> many1Till inner end) + where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) + innerSpace = try $ whitespace <* notFollowedBy' end + +strikeout :: PandocMonad m => MarkdownParser m (F Inlines) +strikeout = fmap B.strikeout <$> + (guardEnabled Ext_strikeout >> inlinesBetween strikeStart strikeEnd) + where strikeStart = string "~~" >> lookAhead nonspaceChar + >> notFollowedBy (char '~') + strikeEnd = try $ string "~~" + +superscript :: PandocMonad m => MarkdownParser m (F Inlines) +superscript = fmap B.superscript <$> try (do + guardEnabled Ext_superscript + char '^' + mconcat <$> many1Till (notFollowedBy spaceChar >> inline) (char '^')) + +subscript :: PandocMonad m => MarkdownParser m (F Inlines) +subscript = fmap B.subscript <$> try (do + guardEnabled Ext_subscript + char '~' + mconcat <$> many1Till (notFollowedBy spaceChar >> inline) (char '~')) + +whitespace :: PandocMonad m => MarkdownParser m (F Inlines) +whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace" + where lb = spaceChar >> skipMany spaceChar >> option B.space (endline >> return B.linebreak) + regsp = skipMany spaceChar >> return B.space + +nonEndline :: PandocMonad m => ParserT [Char] st m Char +nonEndline = satisfy (/='\n') + +str :: PandocMonad m => MarkdownParser m (F Inlines) +str = do + result <- many1 alphaNum + updateLastStrPos + let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) + isSmart <- extensionEnabled Ext_smart <$> getOption readerExtensions + if isSmart + then case likelyAbbrev result of + [] -> return $ return $ B.str result + xs -> choice (map (\x -> + try (string x >> oneOf " \n" >> + lookAhead alphaNum >> + return (return $ B.str + $ result ++ spacesToNbr x ++ "\160"))) xs) + <|> (return $ return $ B.str result) + else return $ return $ B.str result + +-- | if the string matches the beginning of an abbreviation (before +-- the first period, return strings that would finish the abbreviation. +likelyAbbrev :: String -> [String] +likelyAbbrev x = + let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", + "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", + "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", + "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", + "ch.", "sec.", "cf.", "cp."] + abbrPairs = map (break (=='.')) abbrevs + in map snd $ filter (\(y,_) -> y == x) abbrPairs + +-- an endline character that can be treated as a space, not a structural break +endline :: PandocMonad m => MarkdownParser m (F Inlines) +endline = try $ do + newline + notFollowedBy blankline + -- parse potential list-starts differently if in a list: + notFollowedBy (inList >> listStart) + guardDisabled Ext_lists_without_preceding_blankline <|> notFollowedBy listStart + guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart + guardEnabled Ext_blank_before_header <|> (notFollowedBy . char =<< atxChar) -- atx header + guardDisabled Ext_backtick_code_blocks <|> + notFollowedBy (() <$ (lookAhead (char '`') >> codeBlockFenced)) + notFollowedByHtmlCloser + (eof >> return mempty) + <|> (guardEnabled Ext_hard_line_breaks >> return (return B.linebreak)) + <|> (guardEnabled Ext_ignore_line_breaks >> return mempty) + <|> (skipMany spaceChar >> return (return B.softbreak)) + +-- +-- links +-- + +-- a reference label for a link +reference :: PandocMonad m => MarkdownParser m (F Inlines, String) +reference = do notFollowedBy' (string "[^") -- footnote reference + withRaw $ trimInlinesF <$> inlinesInBalancedBrackets + +parenthesizedChars :: PandocMonad m => MarkdownParser m [Char] +parenthesizedChars = do + result <- charsInBalanced '(' ')' litChar + return $ '(' : result ++ ")" + +-- source for a link, with optional title +source :: PandocMonad m => MarkdownParser m (String, String) +source = do + char '(' + skipSpaces + let urlChunk = + try parenthesizedChars + <|> (notFollowedBy (oneOf " )") >> (count 1 litChar)) + <|> try (many1 spaceChar <* notFollowedBy (oneOf "\"')")) + let sourceURL = (unwords . words . concat) <$> many urlChunk + let betweenAngles = try $ + char '<' >> manyTill litChar (char '>') + src <- try betweenAngles <|> sourceURL + tit <- option "" $ try $ spnl >> linkTitle + skipSpaces + char ')' + return (escapeURI $ trimr src, tit) + +linkTitle :: PandocMonad m => MarkdownParser m String +linkTitle = quotedTitle '"' <|> quotedTitle '\'' + +link :: PandocMonad m => MarkdownParser m (F Inlines) +link = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + (lab,raw) <- reference + setState $ st{ stateAllowLinks = True } + regLink B.linkWith lab <|> referenceLink B.linkWith (lab,raw) + +bracketedSpan :: PandocMonad m => MarkdownParser m (F Inlines) +bracketedSpan = try $ do + guardEnabled Ext_bracketed_spans + (lab,_) <- reference + attr <- attributes + let (ident,classes,keyvals) = attr + case lookup "style" keyvals of + Just s | null ident && null classes && + map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + -> return $ B.smallcaps <$> lab + _ -> return $ B.spanWith attr <$> lab + +regLink :: PandocMonad m + => (Attr -> String -> String -> Inlines -> Inlines) + -> F Inlines + -> MarkdownParser m (F Inlines) +regLink constructor lab = try $ do + (src, tit) <- source + attr <- option nullAttr $ + guardEnabled Ext_link_attributes >> attributes + return $ constructor attr src tit <$> lab + +-- a link like [this][ref] or [this][] or [this] +referenceLink :: PandocMonad m + => (Attr -> String -> String -> Inlines -> Inlines) + -> (F Inlines, String) + -> MarkdownParser m (F Inlines) +referenceLink constructor (lab, raw) = do + sp <- (True <$ lookAhead (char ' ')) <|> return False + (_,raw') <- option (mempty, "") $ + lookAhead (try (guardEnabled Ext_citations >> + spnl >> normalCite >> return (mempty, ""))) + <|> + try (spnl >> reference) + when (raw' == "") $ guardEnabled Ext_shortcut_reference_links + let labIsRef = raw' == "" || raw' == "[]" + let key = toKey $ if labIsRef then raw else raw' + parsedRaw <- parseFromString (mconcat <$> many inline) raw' + fallback <- parseFromString (mconcat <$> many inline) $ dropBrackets raw + implicitHeaderRefs <- option False $ + True <$ guardEnabled Ext_implicit_header_references + let makeFallback = do + parsedRaw' <- parsedRaw + fallback' <- fallback + return $ B.str "[" <> fallback' <> B.str "]" <> + (if sp && not (null raw) then B.space else mempty) <> + parsedRaw' + return $ do + keys <- asksF stateKeys + case M.lookup key keys of + Nothing -> + if implicitHeaderRefs + then do + headerKeys <- asksF stateHeaderKeys + case M.lookup key headerKeys of + Just ((src, tit), _) -> constructor nullAttr src tit <$> lab + Nothing -> makeFallback + else makeFallback + Just ((src,tit), attr) -> constructor attr src tit <$> lab + +dropBrackets :: String -> String +dropBrackets = reverse . dropRB . reverse . dropLB + where dropRB (']':xs) = xs + dropRB xs = xs + dropLB ('[':xs) = xs + dropLB xs = xs + +bareURL :: PandocMonad m => MarkdownParser m (F Inlines) +bareURL = try $ do + guardEnabled Ext_autolink_bare_uris + getState >>= guard . stateAllowLinks + (orig, src) <- uri <|> emailAddress + notFollowedBy $ try $ spaces >> htmlTag (~== TagClose "a") + return $ return $ B.link src "" (B.str orig) + +autoLink :: PandocMonad m => MarkdownParser m (F Inlines) +autoLink = try $ do + getState >>= guard . stateAllowLinks + char '<' + (orig, src) <- uri <|> emailAddress + -- in rare cases, something may remain after the uri parser + -- is finished, because the uri parser tries to avoid parsing + -- final punctuation. for example: in `<http://hi---there>`, + -- the URI parser will stop before the dashes. + extra <- fromEntities <$> manyTill nonspaceChar (char '>') + attr <- option nullAttr $ try $ + guardEnabled Ext_link_attributes >> attributes + return $ return $ B.linkWith attr (src ++ escapeURI extra) "" (B.str $ orig ++ extra) + +image :: PandocMonad m => MarkdownParser m (F Inlines) +image = try $ do + char '!' + (lab,raw) <- reference + defaultExt <- getOption readerDefaultImageExtension + let constructor attr' src = case takeExtension src of + "" -> B.imageWith attr' (addExtension src defaultExt) + _ -> B.imageWith attr' src + regLink constructor lab <|> referenceLink constructor (lab,raw) + +note :: PandocMonad m => MarkdownParser m (F Inlines) +note = try $ do + guardEnabled Ext_footnotes + ref <- noteMarker + return $ do + notes <- asksF stateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[^" ++ ref ++ "]" + Just contents -> do + st <- askF + -- process the note in a context that doesn't resolve + -- notes, to avoid infinite looping with notes inside + -- notes: + let contents' = runF contents st{ stateNotes' = [] } + return $ B.note contents' + +inlineNote :: PandocMonad m => MarkdownParser m (F Inlines) +inlineNote = try $ do + guardEnabled Ext_inline_notes + char '^' + contents <- inlinesInBalancedBrackets + return $ B.note . B.para <$> contents + +rawLaTeXInline' :: PandocMonad m => MarkdownParser m (F Inlines) +rawLaTeXInline' = try $ do + guardEnabled Ext_raw_tex + lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env + RawInline _ s <- rawLaTeXInline + return $ return $ B.rawInline "tex" s + -- "tex" because it might be context or latex + +rawConTeXtEnvironment :: PandocMonad m => ParserT [Char] st m String +rawConTeXtEnvironment = try $ do + string "\\start" + completion <- inBrackets (letter <|> digit <|> spaceChar) + <|> (many1 letter) + contents <- manyTill (rawConTeXtEnvironment <|> (count 1 anyChar)) + (try $ string "\\stop" >> string completion) + return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion + +inBrackets :: PandocMonad m => (ParserT [Char] st m Char) -> ParserT [Char] st m String +inBrackets parser = do + char '[' + contents <- many parser + char ']' + return $ "[" ++ contents ++ "]" + +spanHtml :: PandocMonad m => MarkdownParser m (F Inlines) +spanHtml = try $ do + guardEnabled Ext_native_spans + (TagOpen _ attrs, _) <- htmlTag (~== TagOpen "span" []) + contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose "span")) + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + case lookup "style" keyvals of + Just s | null ident && null classes && + map toLower (filter (`notElem` " \t;") s) == + "font-variant:small-caps" + -> return $ B.smallcaps <$> contents + _ -> return $ B.spanWith (ident, classes, keyvals) <$> contents + +divHtml :: PandocMonad m => MarkdownParser m (F Blocks) +divHtml = try $ do + guardEnabled Ext_native_divs + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen "div" []) + -- we set stateInHtmlBlock so that closing tags that can be either block or + -- inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose "div")) >> block) + closed <- option False (True <$ htmlTag (~== TagClose "div")) + if closed + then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents + +rawHtmlInline :: PandocMonad m => MarkdownParser m (F Inlines) +rawHtmlInline = do + guardEnabled Ext_raw_html + inHtmlBlock <- stateInHtmlBlock <$> getState + let isCloseBlockTag t = case inHtmlBlock of + Just t' -> t ~== TagClose t' + Nothing -> False + mdInHtml <- option False $ + ( guardEnabled Ext_markdown_in_html_blocks + <|> guardEnabled Ext_markdown_attribute + ) >> return True + (_,result) <- htmlTag $ if mdInHtml + then (\x -> isInlineTag x && + not (isCloseBlockTag x)) + else not . isTextTag + return $ return $ B.rawInline "html" result + +-- Emoji + +emojiChars :: [Char] +emojiChars = ['a'..'z'] ++ ['0'..'9'] ++ ['_','+','-'] + +emoji :: PandocMonad m => MarkdownParser m (F Inlines) +emoji = try $ do + guardEnabled Ext_emoji + char ':' + emojikey <- many1 (oneOf emojiChars) + char ':' + case M.lookup emojikey emojis of + Just s -> return (return (B.str s)) + Nothing -> mzero + +-- Citations + +cite :: PandocMonad m => MarkdownParser m (F Inlines) +cite = do + guardEnabled Ext_citations + citations <- textualCite + <|> do (cs, raw) <- withRaw normalCite + return $ (flip B.cite (B.text raw)) <$> cs + return citations + +textualCite :: PandocMonad m => MarkdownParser m (F Inlines) +textualCite = try $ do + (_, key) <- citeKey + let first = Citation{ citationId = key + , citationPrefix = [] + , citationSuffix = [] + , citationMode = AuthorInText + , citationNoteNum = 0 + , citationHash = 0 + } + mbrest <- option Nothing $ try $ spnl >> Just <$> withRaw normalCite + case mbrest of + Just (rest, raw) -> + return $ (flip B.cite (B.text $ '@':key ++ " " ++ raw) . (first:)) + <$> rest + Nothing -> + (do + (cs, raw) <- withRaw $ bareloc first + let (spaces',raw') = span isSpace raw + spc | null spaces' = mempty + | otherwise = B.space + lab <- parseFromString (mconcat <$> many inline) $ dropBrackets raw' + fallback <- referenceLink B.linkWith (lab,raw') + return $ do + fallback' <- fallback + cs' <- cs + return $ + case B.toList fallback' of + Link{}:_ -> B.cite [first] (B.str $ '@':key) <> spc <> fallback' + _ -> B.cite cs' (B.text $ '@':key ++ " " ++ raw)) + <|> return (do st <- askF + return $ case M.lookup key (stateExamples st) of + Just n -> B.str (show n) + _ -> B.cite [first] $ B.str $ '@':key) + +bareloc :: PandocMonad m => Citation -> MarkdownParser m (F [Citation]) +bareloc c = try $ do + spnl + char '[' + notFollowedBy $ char '^' + suff <- suffix + rest <- option (return []) $ try $ char ';' >> citeList + spnl + char ']' + notFollowedBy $ oneOf "[(" + return $ do + suff' <- suff + rest' <- rest + return $ c{ citationSuffix = B.toList suff' } : rest' + +normalCite :: PandocMonad m => MarkdownParser m (F [Citation]) +normalCite = try $ do + char '[' + spnl + citations <- citeList + spnl + char ']' + return citations + +suffix :: PandocMonad m => MarkdownParser m (F Inlines) +suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + spnl + rest <- trimInlinesF . mconcat <$> many (notFollowedBy (oneOf ";]") >> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +prefix :: PandocMonad m => MarkdownParser m (F Inlines) +prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey)) + +citeList :: PandocMonad m => MarkdownParser m (F [Citation]) +citeList = fmap sequence $ sepBy1 citation (try $ char ';' >> spnl) + +citation :: PandocMonad m => MarkdownParser m (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + +smart :: PandocMonad m => MarkdownParser m (F Inlines) +smart = do + guardEnabled Ext_smart + doubleQuoted <|> singleQuoted <|> + choice (map (return <$>) [apostrophe, dash, ellipses]) + +singleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + fmap B.singleQuoted . trimInlinesF . mconcat <$> + many1Till inline singleQuoteEnd + +-- doubleQuoted will handle regular double-quoted sections, as well +-- as dialogues with an open double-quote without a close double-quote +-- in the same paragraph. +doubleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) +doubleQuoted = try $ do + doubleQuoteStart + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return + (fmap B.doubleQuoted . trimInlinesF $ contents)) + <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs new file mode 100644 index 000000000..14f9da9b6 --- /dev/null +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -0,0 +1,677 @@ +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 +{- + Copyright (C) 2012-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.MediaWiki + Copyright : Copyright (C) 2012-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of mediawiki text to 'Pandoc' document. +-} +{- +TODO: +_ correctly handle tables within tables +_ parse templates? +-} +module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import Data.Monoid ((<>)) +import Text.Pandoc.Options +import Text.Pandoc.Logging +import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag ) +import Text.Pandoc.XML ( fromEntities ) +import Text.Pandoc.Parsing hiding ( nested ) +import Text.Pandoc.Walk ( walk ) +import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead, stringify, trim ) +import Control.Monad +import Data.List (intersperse, intercalate, isPrefixOf ) +import Text.HTML.TagSoup +import Data.Sequence (viewl, ViewL(..), (<|)) +import qualified Data.Foldable as F +import qualified Data.Map as M +import qualified Data.Set as Set +import Data.Char (isDigit, isSpace) +import Data.Maybe (fromMaybe) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad, report) + +-- | Read mediawiki from an input string and return a Pandoc document. +readMediaWiki :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readMediaWiki opts s = do + parsed <- readWithM parseMediaWiki MWState{ mwOptions = opts + , mwMaxNestingLevel = 4 + , mwNextLinkNumber = 1 + , mwCategoryLinks = [] + , mwHeaderMap = M.empty + , mwIdentifierList = Set.empty + } + (s ++ "\n") + case parsed of + Right result -> return result + Left e -> throwError e + +data MWState = MWState { mwOptions :: ReaderOptions + , mwMaxNestingLevel :: Int + , mwNextLinkNumber :: Int + , mwCategoryLinks :: [Inlines] + , mwHeaderMap :: M.Map Inlines String + , mwIdentifierList :: Set.Set String + } + +type MWParser m = ParserT [Char] MWState m + +instance HasReaderOptions MWState where + extractReaderOptions = mwOptions + +instance HasHeaderMap MWState where + extractHeaderMap = mwHeaderMap + updateHeaderMap f st = st{ mwHeaderMap = f $ mwHeaderMap st } + +instance HasIdentifierList MWState where + extractIdentifierList = mwIdentifierList + updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st } + +-- +-- auxiliary functions +-- + +-- This is used to prevent exponential blowups for things like: +-- ''a'''a''a'''a''a'''a''a'''a +nested :: PandocMonad m => MWParser m a -> MWParser m a +nested p = do + nestlevel <- mwMaxNestingLevel `fmap` getState + guard $ nestlevel > 0 + updateState $ \st -> st{ mwMaxNestingLevel = mwMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ mwMaxNestingLevel = nestlevel } + return res + +specialChars :: [Char] +specialChars = "'[]<=&*{}|\":\\" + +spaceChars :: [Char] +spaceChars = " \n\t" + +sym :: PandocMonad m => String -> MWParser m () +sym s = () <$ try (string s) + +newBlockTags :: [String] +newBlockTags = ["haskell","syntaxhighlight","source","gallery","references"] + +isBlockTag' :: Tag String -> Bool +isBlockTag' tag@(TagOpen t _) = (isBlockTag tag || t `elem` newBlockTags) && + t `notElem` eitherBlockOrInline +isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) && + t `notElem` eitherBlockOrInline +isBlockTag' tag = isBlockTag tag + +isInlineTag' :: Tag String -> Bool +isInlineTag' (TagComment _) = True +isInlineTag' t = not (isBlockTag' t) + +eitherBlockOrInline :: [String] +eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", + "map", "area", "object"] + +htmlComment :: PandocMonad m => MWParser m () +htmlComment = () <$ htmlTag isCommentTag + +inlinesInTags :: PandocMonad m => String -> MWParser m Inlines +inlinesInTags tag = try $ do + (_,raw) <- htmlTag (~== TagOpen tag []) + if '/' `elem` raw -- self-closing tag + then return mempty + else trimInlines . mconcat <$> + manyTill inline (htmlTag (~== TagClose tag)) + +blocksInTags :: PandocMonad m => String -> MWParser m Blocks +blocksInTags tag = try $ do + (_,raw) <- htmlTag (~== TagOpen tag []) + let closer = if tag == "li" + then htmlTag (~== TagClose "li") + <|> lookAhead ( + htmlTag (~== TagOpen "li" []) + <|> htmlTag (~== TagClose "ol") + <|> htmlTag (~== TagClose "ul")) + else htmlTag (~== TagClose tag) + if '/' `elem` raw -- self-closing tag + then return mempty + else mconcat <$> manyTill block closer + +charsInTags :: PandocMonad m => String -> MWParser m [Char] +charsInTags tag = try $ do + (_,raw) <- htmlTag (~== TagOpen tag []) + if '/' `elem` raw -- self-closing tag + then return "" + else manyTill anyChar (htmlTag (~== TagClose tag)) + +-- +-- main parser +-- + +parseMediaWiki :: PandocMonad m => MWParser m Pandoc +parseMediaWiki = do + bs <- mconcat <$> many block + spaces + eof + categoryLinks <- reverse . mwCategoryLinks <$> getState + let categories = if null categoryLinks + then mempty + else B.para $ mconcat $ intersperse B.space categoryLinks + return $ B.doc $ bs <> categories + +-- +-- block parsers +-- + +block :: PandocMonad m => MWParser m Blocks +block = do + pos <- getPosition + res <- mempty <$ skipMany1 blankline + <|> table + <|> header + <|> hrule + <|> orderedList + <|> bulletList + <|> definitionList + <|> mempty <$ try (spaces *> htmlComment) + <|> preformatted + <|> blockTag + <|> (B.rawBlock "mediawiki" <$> template) + <|> para + report $ ParsingTrace (take 60 $ show $ B.toList res) pos + return res + +para :: PandocMonad m => MWParser m Blocks +para = do + contents <- trimInlines . mconcat <$> many1 inline + if F.all (==Space) contents + then return mempty + else return $ B.para contents + +table :: PandocMonad m => MWParser m Blocks +table = do + tableStart + styles <- option [] parseAttrs <* blankline + let tableWidth = case lookup "width" styles of + Just w -> fromMaybe 1.0 $ parseWidth w + Nothing -> 1.0 + caption <- option mempty tableCaption + optional rowsep + hasheader <- option False $ True <$ (lookAhead (skipSpaces *> char '!')) + (cellspecs',hdr) <- unzip <$> tableRow + let widths = map ((tableWidth *) . snd) cellspecs' + let restwidth = tableWidth - sum widths + let zerocols = length $ filter (==0.0) widths + let defaultwidth = if zerocols == 0 || zerocols == length widths + then 0.0 + else restwidth / fromIntegral zerocols + let widths' = map (\w -> if w == 0 then defaultwidth else w) widths + let cellspecs = zip (map fst cellspecs') widths' + rows' <- many $ try $ rowsep *> (map snd <$> tableRow) + optional blanklines + tableEnd + let cols = length hdr + let (headers,rows) = if hasheader + then (hdr, rows') + else (replicate cols mempty, hdr:rows') + return $ B.table caption cellspecs headers rows + +parseAttrs :: PandocMonad m => MWParser m [(String,String)] +parseAttrs = many1 parseAttr + +parseAttr :: PandocMonad m => MWParser m (String, String) +parseAttr = try $ do + skipMany spaceChar + k <- many1 letter + char '=' + v <- (char '"' >> many1Till (satisfy (/='\n')) (char '"')) + <|> many1 (satisfy $ \c -> not (isSpace c) && c /= '|') + return (k,v) + +tableStart :: PandocMonad m => MWParser m () +tableStart = try $ guardColumnOne *> skipSpaces *> sym "{|" + +tableEnd :: PandocMonad m => MWParser m () +tableEnd = try $ guardColumnOne *> skipSpaces *> sym "|}" + +rowsep :: PandocMonad m => MWParser m () +rowsep = try $ guardColumnOne *> skipSpaces *> sym "|-" <* + optional parseAttr <* blanklines + +cellsep :: PandocMonad m => MWParser m () +cellsep = try $ + (guardColumnOne *> skipSpaces <* + ( (char '|' <* notFollowedBy (oneOf "-}+")) + <|> (char '!') + ) + ) + <|> (() <$ try (string "||")) + <|> (() <$ try (string "!!")) + +tableCaption :: PandocMonad m => MWParser m Inlines +tableCaption = try $ do + guardColumnOne + skipSpaces + sym "|+" + optional (try $ parseAttr *> skipSpaces *> char '|' *> skipSpaces) + (trimInlines . mconcat) <$> many (notFollowedBy (cellsep <|> rowsep) *> inline) + +tableRow :: PandocMonad m => MWParser m [((Alignment, Double), Blocks)] +tableRow = try $ skipMany htmlComment *> many tableCell + +tableCell :: PandocMonad m => MWParser m ((Alignment, Double), Blocks) +tableCell = try $ do + cellsep + skipMany spaceChar + attrs <- option [] $ try $ parseAttrs <* skipSpaces <* char '|' <* + notFollowedBy (char '|') + skipMany spaceChar + ls <- concat <$> many (notFollowedBy (cellsep <|> rowsep <|> tableEnd) *> + ((snd <$> withRaw table) <|> count 1 anyChar)) + bs <- parseFromString (mconcat <$> many block) ls + let align = case lookup "align" attrs of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let width = case lookup "width" attrs of + Just xs -> fromMaybe 0.0 $ parseWidth xs + Nothing -> 0.0 + return ((align, width), bs) + +parseWidth :: String -> Maybe Double +parseWidth s = + case reverse s of + ('%':ds) | all isDigit ds -> safeRead ('0':'.':reverse ds) + _ -> Nothing + +template :: PandocMonad m => MWParser m String +template = try $ do + string "{{" + notFollowedBy (char '{') + lookAhead $ letter <|> digit <|> char ':' + let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar + contents <- manyTill chunk (try $ string "}}") + return $ "{{" ++ concat contents ++ "}}" + +blockTag :: PandocMonad m => MWParser m Blocks +blockTag = do + (tag, _) <- lookAhead $ htmlTag isBlockTag' + case tag of + TagOpen "blockquote" _ -> B.blockQuote <$> blocksInTags "blockquote" + TagOpen "pre" _ -> B.codeBlock . trimCode <$> charsInTags "pre" + TagOpen "syntaxhighlight" attrs -> syntaxhighlight "syntaxhighlight" attrs + TagOpen "source" attrs -> syntaxhighlight "source" attrs + TagOpen "haskell" _ -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$> + charsInTags "haskell" + TagOpen "gallery" _ -> blocksInTags "gallery" + TagOpen "p" _ -> mempty <$ htmlTag (~== tag) + TagClose "p" -> mempty <$ htmlTag (~== tag) + _ -> B.rawBlock "html" . snd <$> htmlTag (~== tag) + +trimCode :: String -> String +trimCode ('\n':xs) = stripTrailingNewlines xs +trimCode xs = stripTrailingNewlines xs + +syntaxhighlight :: PandocMonad m => String -> [Attribute String] -> MWParser m Blocks +syntaxhighlight tag attrs = try $ do + let mblang = lookup "lang" attrs + let mbstart = lookup "start" attrs + let mbline = lookup "line" attrs + let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline + let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart + contents <- charsInTags tag + return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents + +hrule :: PandocMonad m => MWParser m Blocks +hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline) + +guardColumnOne :: PandocMonad m => MWParser m () +guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1) + +preformatted :: PandocMonad m => MWParser m Blocks +preformatted = try $ do + guardColumnOne + char ' ' + let endline' = B.linebreak <$ (try $ newline <* char ' ') + let whitespace' = B.str <$> many1 ('\160' <$ spaceChar) + let spToNbsp ' ' = '\160' + spToNbsp x = x + let nowiki' = mconcat . intersperse B.linebreak . map B.str . + lines . fromEntities . map spToNbsp <$> try + (htmlTag (~== TagOpen "nowiki" []) *> + manyTill anyChar (htmlTag (~== TagClose "nowiki"))) + let inline' = whitespace' <|> endline' <|> nowiki' + <|> (try $ notFollowedBy newline *> inline) + contents <- mconcat <$> many1 inline' + let spacesStr (Str xs) = all isSpace xs + spacesStr _ = False + if F.all spacesStr contents + then return mempty + else return $ B.para $ encode contents + +encode :: Inlines -> Inlines +encode = B.fromList . normalizeCode . B.toList . walk strToCode + where strToCode (Str s) = Code ("",[],[]) s + strToCode Space = Code ("",[],[]) " " + strToCode x = x + normalizeCode [] = [] + normalizeCode (Code a1 x : Code a2 y : zs) | a1 == a2 = + normalizeCode $ (Code a1 (x ++ y)) : zs + normalizeCode (x:xs) = x : normalizeCode xs + +header :: PandocMonad m => MWParser m Blocks +header = try $ do + guardColumnOne + eqs <- many1 (char '=') + let lev = length eqs + guard $ lev <= 6 + contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') + attr <- registerHeader nullAttr contents + return $ B.headerWith attr lev contents + +bulletList :: PandocMonad m => MWParser m Blocks +bulletList = B.bulletList <$> + ( many1 (listItem '*') + <|> (htmlTag (~== TagOpen "ul" []) *> spaces *> many (listItem '*' <|> li) <* + optional (htmlTag (~== TagClose "ul"))) ) + +orderedList :: PandocMonad m => MWParser m Blocks +orderedList = + (B.orderedList <$> many1 (listItem '#')) + <|> try + (do (tag,_) <- htmlTag (~== TagOpen "ol" []) + spaces + items <- many (listItem '#' <|> li) + optional (htmlTag (~== TagClose "ol")) + let start = fromMaybe 1 $ safeRead $ fromAttrib "start" tag + return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items) + +definitionList :: PandocMonad m => MWParser m Blocks +definitionList = B.definitionList <$> many1 defListItem + +defListItem :: PandocMonad m => MWParser m (Inlines, [Blocks]) +defListItem = try $ do + terms <- mconcat . intersperse B.linebreak <$> many defListTerm + -- we allow dd with no dt, or dt with no dd + defs <- if B.isNull terms + then notFollowedBy + (try $ skipMany1 (char ':') >> string "<math>") *> + many1 (listItem ':') + else many (listItem ':') + return (terms, defs) + +defListTerm :: PandocMonad m => MWParser m Inlines +defListTerm = char ';' >> skipMany spaceChar >> anyLine >>= + parseFromString (trimInlines . mconcat <$> many inline) + +listStart :: PandocMonad m => Char -> MWParser m () +listStart c = char c *> notFollowedBy listStartChar + +listStartChar :: PandocMonad m => MWParser m Char +listStartChar = oneOf "*#;:" + +anyListStart :: PandocMonad m => MWParser m Char +anyListStart = char '*' + <|> char '#' + <|> char ':' + <|> char ';' + +li :: PandocMonad m => MWParser m Blocks +li = lookAhead (htmlTag (~== TagOpen "li" [])) *> + (firstParaToPlain <$> blocksInTags "li") <* spaces + +listItem :: PandocMonad m => Char -> MWParser m Blocks +listItem c = try $ do + extras <- many (try $ char c <* lookAhead listStartChar) + if null extras + then listItem' c + else do + skipMany spaceChar + first <- concat <$> manyTill listChunk newline + rest <- many + (try $ string extras *> lookAhead listStartChar *> + (concat <$> manyTill listChunk newline)) + contents <- parseFromString (many1 $ listItem' c) + (unlines (first : rest)) + case c of + '*' -> return $ B.bulletList contents + '#' -> return $ B.orderedList contents + ':' -> return $ B.definitionList [(mempty, contents)] + _ -> mzero + +-- The point of this is to handle stuff like +-- * {{cite book +-- | blah +-- | blah +-- }} +-- * next list item +-- which seems to be valid mediawiki. +listChunk :: PandocMonad m => MWParser m String +listChunk = template <|> count 1 anyChar + +listItem' :: PandocMonad m => Char -> MWParser m Blocks +listItem' c = try $ do + listStart c + skipMany spaceChar + first <- concat <$> manyTill listChunk newline + rest <- many (try $ char c *> lookAhead listStartChar *> + (concat <$> manyTill listChunk newline)) + parseFromString (firstParaToPlain . mconcat <$> many1 block) + $ unlines $ first : rest + +firstParaToPlain :: Blocks -> Blocks +firstParaToPlain contents = + case viewl (B.unMany contents) of + (Para xs) :< ys -> B.Many $ (Plain xs) <| ys + _ -> contents + +-- +-- inline parsers +-- + +inline :: PandocMonad m => MWParser m Inlines +inline = whitespace + <|> url + <|> str + <|> doubleQuotes + <|> strong + <|> emph + <|> image + <|> internalLink + <|> externalLink + <|> math + <|> inlineTag + <|> B.singleton <$> charRef + <|> inlineHtml + <|> (B.rawInline "mediawiki" <$> variable) + <|> (B.rawInline "mediawiki" <$> template) + <|> special + +str :: PandocMonad m => MWParser m Inlines +str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) + +math :: PandocMonad m => MWParser m Inlines +math = (B.displayMath . trim <$> try (many1 (char ':') >> charsInTags "math")) + <|> (B.math . trim <$> charsInTags "math") + <|> (B.displayMath . trim <$> try (dmStart *> manyTill anyChar dmEnd)) + <|> (B.math . trim <$> try (mStart *> manyTill (satisfy (/='\n')) mEnd)) + where dmStart = string "\\[" + dmEnd = try (string "\\]") + mStart = string "\\(" + mEnd = try (string "\\)") + +variable :: PandocMonad m => MWParser m String +variable = try $ do + string "{{{" + contents <- manyTill anyChar (try $ string "}}}") + return $ "{{{" ++ contents ++ "}}}" + +inlineTag :: PandocMonad m => MWParser m Inlines +inlineTag = do + (tag, _) <- lookAhead $ htmlTag isInlineTag' + case tag of + TagOpen "ref" _ -> B.note . B.plain <$> inlinesInTags "ref" + TagOpen "nowiki" _ -> try $ do + (_,raw) <- htmlTag (~== tag) + if '/' `elem` raw + then return mempty + else B.text . fromEntities <$> + manyTill anyChar (htmlTag (~== TagClose "nowiki")) + TagOpen "br" _ -> B.linebreak <$ (htmlTag (~== TagOpen "br" []) -- will get /> too + *> optional blankline) + TagOpen "strike" _ -> B.strikeout <$> inlinesInTags "strike" + TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del" + TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub" + TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup" + TagOpen "code" _ -> encode <$> inlinesInTags "code" + TagOpen "tt" _ -> encode <$> inlinesInTags "tt" + TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" + _ -> B.rawInline "html" . snd <$> htmlTag (~== tag) + +special :: PandocMonad m => MWParser m Inlines +special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *> + oneOf specialChars) + +inlineHtml :: PandocMonad m => MWParser m Inlines +inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag' + +whitespace :: PandocMonad m => MWParser m Inlines +whitespace = B.space <$ (skipMany1 spaceChar <|> htmlComment) + <|> B.softbreak <$ endline + +endline :: PandocMonad m => MWParser m () +endline = () <$ try (newline <* + notFollowedBy spaceChar <* + notFollowedBy newline <* + notFollowedBy' hrule <* + notFollowedBy tableStart <* + notFollowedBy' header <* + notFollowedBy anyListStart) + +imageIdentifiers :: PandocMonad m => [MWParser m ()] +imageIdentifiers = [sym (identifier ++ ":") | identifier <- identifiers] + where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier", + "Bild"] + +image :: PandocMonad m => MWParser m Inlines +image = try $ do + sym "[[" + choice imageIdentifiers + fname <- addUnderscores <$> many1 (noneOf "|]") + _ <- many imageOption + dims <- try (char '|' *> (sepBy (many digit) (char 'x')) <* string "px") + <|> return [] + _ <- many imageOption + let kvs = case dims of + w:[] -> [("width", w)] + w:(h:[]) -> [("width", w), ("height", h)] + _ -> [] + let attr = ("", [], kvs) + caption <- (B.str fname <$ sym "]]") + <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) + return $ B.imageWith attr fname ("fig:" ++ stringify caption) caption + +imageOption :: PandocMonad m => MWParser m String +imageOption = try $ char '|' *> opt + where + opt = try (oneOfStrings [ "border", "thumbnail", "frameless" + , "thumb", "upright", "left", "right" + , "center", "none", "baseline", "sub" + , "super", "top", "text-top", "middle" + , "bottom", "text-bottom" ]) + <|> try (string "frame") + <|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]")) + +collapseUnderscores :: String -> String +collapseUnderscores [] = [] +collapseUnderscores ('_':'_':xs) = collapseUnderscores ('_':xs) +collapseUnderscores (x:xs) = x : collapseUnderscores xs + +addUnderscores :: String -> String +addUnderscores = collapseUnderscores . intercalate "_" . words + +internalLink :: PandocMonad m => MWParser m Inlines +internalLink = try $ do + sym "[[" + pagename <- unwords . words <$> many (noneOf "|]") + label <- option (B.text pagename) $ char '|' *> + ( (mconcat <$> many1 (notFollowedBy (char ']') *> inline)) + -- the "pipe trick" + -- [[Help:Contents|] -> "Contents" + <|> (return $ B.text $ drop 1 $ dropWhile (/=':') pagename) ) + sym "]]" + linktrail <- B.text <$> many letter + let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail) + if "Category:" `isPrefixOf` pagename + then do + updateState $ \st -> st{ mwCategoryLinks = link : mwCategoryLinks st } + return mempty + else return link + +externalLink :: PandocMonad m => MWParser m Inlines +externalLink = try $ do + char '[' + (_, src) <- uri + lab <- try (trimInlines . mconcat <$> + (skipMany1 spaceChar *> manyTill inline (char ']'))) + <|> do char ']' + num <- mwNextLinkNumber <$> getState + updateState $ \st -> st{ mwNextLinkNumber = num + 1 } + return $ B.str $ show num + return $ B.link src "" lab + +url :: PandocMonad m => MWParser m Inlines +url = do + (orig, src) <- uri + return $ B.link src "" (B.str orig) + +-- | Parses a list of inlines between start and end delimiters. +inlinesBetween :: (PandocMonad m, Show b) => MWParser m a -> MWParser m b -> MWParser m Inlines +inlinesBetween start end = + (trimInlines . mconcat) <$> try (start >> many1Till inner end) + where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) + innerSpace = try $ whitespace <* notFollowedBy' end + +emph :: PandocMonad m => MWParser m Inlines +emph = B.emph <$> nested (inlinesBetween start end) + where start = sym "''" >> lookAhead nonspaceChar + end = try $ notFollowedBy' (() <$ strong) >> sym "''" + +strong :: PandocMonad m => MWParser m Inlines +strong = B.strong <$> nested (inlinesBetween start end) + where start = sym "'''" >> lookAhead nonspaceChar + end = try $ sym "'''" + +doubleQuotes :: PandocMonad m => MWParser m Inlines +doubleQuotes = B.doubleQuoted <$> nested (inlinesBetween openDoubleQuote closeDoubleQuote) + where openDoubleQuote = sym "\"" >> lookAhead nonspaceChar + closeDoubleQuote = try $ sym "\"" diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs new file mode 100644 index 000000000..1953c0c83 --- /dev/null +++ b/src/Text/Pandoc/Readers/Native.hs @@ -0,0 +1,71 @@ +{- +Copyright (C) 2011-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; Either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Native + Copyright : Copyright (C) 2011-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of a string representation of a pandoc type (@Pandoc@, +@[Block]@, @Block@, @[Inline]@, or @Inline@) to a @Pandoc@ document. +-} +module Text.Pandoc.Readers.Native ( readNative ) where + +import Text.Pandoc.Definition +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Options (ReaderOptions) + +import Control.Monad.Except (throwError) +import Text.Pandoc.Error +import Text.Pandoc.Class + +-- | Read native formatted text and return a Pandoc document. +-- The input may be a full pandoc document, a block list, a block, +-- an inline list, or an inline. Thus, for example, +-- +-- > Str "hi" +-- +-- will be treated as if it were +-- +-- > Pandoc nullMeta [Plain [Str "hi"]] +-- +readNative :: PandocMonad m + => ReaderOptions + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readNative _ s = + case maybe (Pandoc nullMeta <$> readBlocks s) Right (safeRead s) of + Right doc -> return doc + Left _ -> throwError $ PandocParseError "couldn't read native" + +readBlocks :: String -> Either PandocError [Block] +readBlocks s = maybe ((:[]) <$> readBlock s) Right (safeRead s) + +readBlock :: String -> Either PandocError Block +readBlock s = maybe (Plain <$> readInlines s) Right (safeRead s) + +readInlines :: String -> Either PandocError [Inline] +readInlines s = maybe ((:[]) <$> readInline s) Right (safeRead s) + +readInline :: String -> Either PandocError Inline +readInline s = maybe (Left . PandocParseError $ "Could not read: " ++ s) Right (safeRead s) + diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs new file mode 100644 index 000000000..cec64895c --- /dev/null +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -0,0 +1,103 @@ +{-# LANGUAGE FlexibleContexts #-} +module Text.Pandoc.Readers.OPML ( readOPML ) where +import Data.Char (toUpper) +import Text.Pandoc.Options +import Text.Pandoc.Definition +import Text.Pandoc.Builder +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Readers.Markdown (readMarkdown) +import Text.XML.Light +import Text.HTML.TagSoup.Entity (lookupEntity) +import Data.Generics +import Control.Monad.State +import Data.Default +import Text.Pandoc.Class (PandocMonad) + +type OPML m = StateT OPMLState m + +data OPMLState = OPMLState{ + opmlSectionLevel :: Int + , opmlDocTitle :: Inlines + , opmlDocAuthors :: [Inlines] + , opmlDocDate :: Inlines + } deriving Show + +instance Default OPMLState where + def = OPMLState{ opmlSectionLevel = 0 + , opmlDocTitle = mempty + , opmlDocAuthors = [] + , opmlDocDate = mempty + } + +readOPML :: PandocMonad m => ReaderOptions -> String -> m Pandoc +readOPML _ inp = do + (bs, st') <- flip runStateT def (mapM parseBlock $ normalizeTree $ parseXML inp) + return $ + setTitle (opmlDocTitle st') $ + setAuthors (opmlDocAuthors st') $ + setDate (opmlDocDate st') $ + doc $ mconcat bs + +-- normalize input, consolidating adjacent Text and CRef elements +normalizeTree :: [Content] -> [Content] +normalizeTree = everywhere (mkT go) + where go :: [Content] -> [Content] + go (Text (CData CDataRaw _ _):xs) = xs + go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = + Text (CData CDataText (s1 ++ s2) z):xs + go (Text (CData CDataText s1 z):CRef r:xs) = + Text (CData CDataText (s1 ++ convertEntity r) z):xs + go (CRef r:Text (CData CDataText s1 z):xs) = + Text (CData CDataText (convertEntity r ++ s1) z):xs + go (CRef r1:CRef r2:xs) = + Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + go xs = xs + +convertEntity :: String -> String +convertEntity e = maybe (map toUpper e) id (lookupEntity e) + +-- convenience function to get an attribute value, defaulting to "" +attrValue :: String -> Element -> String +attrValue attr elt = + case lookupAttrBy (\x -> qName x == attr) (elAttribs elt) of + Just z -> z + Nothing -> "" + +-- exceptT :: PandocMonad m => Either PandocError a -> OPML m a +-- exceptT = either throwError return + +asHtml :: PandocMonad m => String -> OPML m Inlines +asHtml s = + (\(Pandoc _ bs) -> case bs of + [Plain ils] -> fromList ils + _ -> mempty) <$> (lift $ readHtml def s) + +asMarkdown :: PandocMonad m => String -> OPML m Blocks +asMarkdown s = (\(Pandoc _ bs) -> fromList bs) <$> (lift $ readMarkdown def s) + +getBlocks :: PandocMonad m => Element -> OPML m Blocks +getBlocks e = mconcat <$> (mapM parseBlock $ elContent e) + +parseBlock :: PandocMonad m => Content -> OPML m Blocks +parseBlock (Elem e) = + case qName (elName e) of + "ownerName" -> mempty <$ modify (\st -> + st{opmlDocAuthors = [text $ strContent e]}) + "dateModified" -> mempty <$ modify (\st -> + st{opmlDocDate = text $ strContent e}) + "title" -> mempty <$ modify (\st -> + st{opmlDocTitle = text $ strContent e}) + "outline" -> gets opmlSectionLevel >>= sect . (+1) + "?xml" -> return mempty + _ -> getBlocks e + where sect n = do headerText <- asHtml $ attrValue "text" e + noteBlocks <- asMarkdown $ attrValue "_note" e + modify $ \st -> st{ opmlSectionLevel = n } + bs <- getBlocks e + modify $ \st -> st{ opmlSectionLevel = n - 1 } + let headerText' = case map toUpper (attrValue "type" e) of + "LINK" -> link + (attrValue "url" e) "" headerText + _ -> headerText + return $ header n headerText' <> noteBlocks <> bs +parseBlock _ = return mempty diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs new file mode 100644 index 000000000..ac22f2c09 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -0,0 +1,113 @@ +{-# LANGUAGE PatternGuards #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Reader.Odt + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Entry point to the odt reader. +-} + +module Text.Pandoc.Readers.Odt ( readOdt ) where + +import Codec.Archive.Zip +import qualified Text.XML.Light as XML + +import qualified Data.ByteString.Lazy as B + +import System.FilePath + +import Control.Monad.Except (throwError) + +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.Options +import Text.Pandoc.MediaBag +import qualified Text.Pandoc.UTF8 as UTF8 + +import Text.Pandoc.Readers.Odt.ContentReader +import Text.Pandoc.Readers.Odt.StyleReader + +import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Shared (filteredFilesFromArchive) + +readOdt :: PandocMonad m + => ReaderOptions + -> B.ByteString + -> m Pandoc +readOdt opts bytes = case readOdt' opts bytes of + Right (doc, mb) -> do + P.setMediaBag mb + return doc + Left e -> throwError e + +-- +readOdt' :: ReaderOptions + -> B.ByteString + -> Either PandocError (Pandoc, MediaBag) +readOdt' _ bytes = bytesToOdt bytes-- of +-- Right (pandoc, mediaBag) -> Right (pandoc , mediaBag) +-- Left err -> Left err + +-- +bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag) +bytesToOdt bytes = case toArchiveOrFail bytes of + Right archive -> archiveToOdt archive + Left _ -> Left $ PandocParseError "Couldn't parse odt file." + +-- +archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) +archiveToOdt archive + | Just contentEntry <- findEntryByPath "content.xml" archive + , Just stylesEntry <- findEntryByPath "styles.xml" archive + , Just contentElem <- entryToXmlElem contentEntry + , Just stylesElem <- entryToXmlElem stylesEntry + , Right styles <- chooseMax (readStylesAt stylesElem ) + (readStylesAt contentElem) + , media <- filteredFilesFromArchive archive filePathIsOdtMedia + , startState <- readerState styles media + , Right pandocWithMedia <- runConverter' read_body + startState + contentElem + + = Right pandocWithMedia + + | otherwise + -- Not very detailed, but I don't think more information would be helpful + = Left $ PandocParseError "Couldn't parse odt file." + where + filePathIsOdtMedia :: FilePath -> Bool + filePathIsOdtMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "Pictures/") + + +-- +entryToXmlElem :: Entry -> Maybe XML.Element +entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs new file mode 100644 index 000000000..b056f1ecc --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs @@ -0,0 +1,253 @@ +{-# LANGUAGE Arrows #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE FlexibleInstances #-} +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Arrows.State + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +An arrow that transports a state. It is in essence a more powerful version of +the standard state monad. As it is such a simple extension, there are +other version out there that do exactly the same. +The implementation is duplicated, though, to add some useful features. +Most of these might be implemented without access to innards, but it's much +faster and easier to implement this way. +-} + +module Text.Pandoc.Readers.Odt.Arrows.State where + +import Prelude hiding ( foldr, foldl ) + +import qualified Control.Category as Cat +import Control.Arrow +import Control.Monad + +import Data.Foldable +import Data.Monoid + +import Text.Pandoc.Readers.Odt.Arrows.Utils +import Text.Pandoc.Readers.Odt.Generic.Fallible + + +newtype ArrowState state a b = ArrowState + { runArrowState :: (state, a) -> (state, b) } + +-- | Constructor +withState :: (state -> a -> (state, b)) -> ArrowState state a b +withState = ArrowState . uncurry + +-- | Constructor +withState' :: ((state, a) -> (state, b)) -> ArrowState state a b +withState' = ArrowState + +-- | Constructor +modifyState :: (state -> state ) -> ArrowState state a a +modifyState = ArrowState . first + +-- | Constructor +ignoringState :: ( a -> b ) -> ArrowState state a b +ignoringState = ArrowState . second + +-- | Constructor +fromState :: (state -> (state, b)) -> ArrowState state a b +fromState = ArrowState . (.fst) + +-- | Constructor +extractFromState :: (state -> b ) -> ArrowState state x b +extractFromState f = ArrowState $ \(state,_) -> (state, f state) + +-- | Constructor +withUnchangedState :: (state -> a -> b ) -> ArrowState state a b +withUnchangedState f = ArrowState $ \(state,a) -> (state, f state a) + +-- | Constructor +tryModifyState :: (state -> Either f state) + -> ArrowState state a (Either f a) +tryModifyState f = ArrowState $ \(state,a) + -> (state,).Left ||| (,Right a) $ f state + +instance Cat.Category (ArrowState s) where + id = ArrowState id + arrow2 . arrow1 = ArrowState $ (runArrowState arrow2).(runArrowState arrow1) + +instance Arrow (ArrowState state) where + arr = ignoringState + first a = ArrowState $ \(s,(aF,aS)) + -> second (,aS) $ runArrowState a (s,aF) + second a = ArrowState $ \(s,(aF,aS)) + -> second (aF,) $ runArrowState a (s,aS) + +instance ArrowChoice (ArrowState state) where + left a = ArrowState $ \(s,e) -> case e of + Left l -> second Left $ runArrowState a (s,l) + Right r -> (s, Right r) + right a = ArrowState $ \(s,e) -> case e of + Left l -> (s, Left l) + Right r -> second Right $ runArrowState a (s,r) + +instance ArrowLoop (ArrowState state) where + loop a = ArrowState $ \(s, x) + -> let (s', (x', _d)) = runArrowState a (s, (x, _d)) + in (s', x') + +instance ArrowApply (ArrowState state) where + app = ArrowState $ \(s, (f,b)) -> runArrowState f (s,b) + + +-- | Embedding of a state arrow in a state arrow with a different state type. +switchState :: (s -> s') -> (s' -> s) -> ArrowState s' x y -> ArrowState s x y +switchState there back a = ArrowState $ first there + >>> runArrowState a + >>> first back + +-- | Lift a state arrow to modify the state of an arrow +-- with a different state type. +liftToState :: (s -> s') -> ArrowState s' s s -> ArrowState s x x +liftToState unlift a = modifyState $ unlift &&& id + >>> runArrowState a + >>> snd + +-- | Switches the type of the state temporarily. +-- Drops the intermediate result state, behaving like the identity arrow, +-- save for side effects in the state. +withSubState :: ArrowState s x s2 -> ArrowState s2 s s -> ArrowState s x x +withSubState unlift a = keepingTheValue (withSubState unlift a) >>^ fst + +-- | Switches the type of the state temporarily. +-- Returns the resulting sub-state. +withSubState' :: ArrowState s x s' -> ArrowState s' s s -> ArrowState s x s' +withSubState' unlift a = ArrowState $ runArrowState unlift + >>> switch + >>> runArrowState a + >>> switch + where switch (x,y) = (y,x) + +-- | Switches the type of the state temporarily. +-- Drops the intermediate result state, behaving like a fallible +-- identity arrow, save for side effects in the state. +withSubStateF :: ArrowState s x (Either f s') + -> ArrowState s' s (Either f s ) + -> ArrowState s x (Either f x ) +withSubStateF unlift a = keepingTheValue (withSubStateF' unlift a) + >>^ spreadChoice + >>^ fmap fst + +-- | Switches the type of the state temporarily. +-- Returns the resulting sub-state. +withSubStateF' :: ArrowState s x (Either f s') + -> ArrowState s' s (Either f s ) + -> ArrowState s x (Either f s') +withSubStateF' unlift a = ArrowState go + where go p@(s,_) = tryRunning unlift + ( tryRunning a (second Right) ) + p + where tryRunning a' b v = case runArrowState a' v of + (_ , Left f) -> (s, Left f) + (x , Right y) -> b (y,x) + +-- | Fold a state arrow through something 'Foldable'. Collect the results +-- in a 'Monoid'. +-- Intermediate form of a fold between one with "only" a 'Monoid' +-- and one with any function. +foldS :: (Foldable f, Monoid m) => ArrowState s x m -> ArrowState s (f x) m +foldS a = ArrowState $ \(s,f) -> foldr a' (s,mempty) f + where a' x (s',m) = second (m <>) $ runArrowState a (s',x) + +-- | Fold a state arrow through something 'Foldable'. Collect the results +-- in a 'Monoid'. +-- Intermediate form of a fold between one with "only" a 'Monoid' +-- and one with any function. +foldSL :: (Foldable f, Monoid m) => ArrowState s x m -> ArrowState s (f x) m +foldSL a = ArrowState $ \(s,f) -> foldl a' (s,mempty) f + where a' (s',m) x = second (m <>) $ runArrowState a (s',x) + +-- | Fold a fallible state arrow through something 'Foldable'. Collect the +-- results in a 'Monoid'. +-- Intermediate form of a fold between one with "only" a 'Monoid' +-- and one with any function. +-- If the iteration fails, the state will be reset to the initial one. +foldS' :: (Foldable f, Monoid m) + => ArrowState s x (Either e m) + -> ArrowState s (f x) (Either e m) +foldS' a = ArrowState $ \(s,f) -> foldr (a' s) (s,Right mempty) f + where a' s x (s',Right m) = case runArrowState a (s',x) of + (s'',Right m') -> (s'', Right (m <> m')) + (_ ,Left e ) -> (s , Left e) + a' _ _ e = e + +-- | Fold a fallible state arrow through something 'Foldable'. Collect the +-- results in a 'Monoid'. +-- Intermediate form of a fold between one with "only" a 'Monoid' +-- and one with any function. +-- If the iteration fails, the state will be reset to the initial one. +foldSL' :: (Foldable f, Monoid m) + => ArrowState s x (Either e m) + -> ArrowState s (f x) (Either e m) +foldSL' a = ArrowState $ \(s,f) -> foldl (a' s) (s,Right mempty) f + where a' s (s',Right m) x = case runArrowState a (s',x) of + (s'',Right m') -> (s'', Right (m <> m')) + (_ ,Left e ) -> (s , Left e) + a' _ e _ = e + +-- | Fold a state arrow through something 'Foldable'. Collect the results in a +-- 'MonadPlus'. +iterateS :: (Foldable f, MonadPlus m) + => ArrowState s x y + -> ArrowState s (f x) (m y) +iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f + where a' x (s',m) = second ((mplus m).return) $ runArrowState a (s',x) + +-- | Fold a state arrow through something 'Foldable'. Collect the results in a +-- 'MonadPlus'. +iterateSL :: (Foldable f, MonadPlus m) + => ArrowState s x y + -> ArrowState s (f x) (m y) +iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f + where a' (s',m) x = second ((mplus m).return) $ runArrowState a (s',x) + + +-- | Fold a fallible state arrow through something 'Foldable'. +-- Collect the results in a 'MonadPlus'. +-- If the iteration fails, the state will be reset to the initial one. +iterateS' :: (Foldable f, MonadPlus m) + => ArrowState s x (Either e y ) + -> ArrowState s (f x) (Either e (m y)) +iterateS' a = ArrowState $ \(s,f) -> foldr (a' s) (s,Right mzero) f + where a' s x (s',Right m) = case runArrowState a (s',x) of + (s'',Right m') -> (s'',Right $ mplus m $ return m') + (_ ,Left e ) -> (s ,Left e ) + a' _ _ e = e + +-- | Fold a fallible state arrow through something 'Foldable'. +-- Collect the results in a 'MonadPlus'. +-- If the iteration fails, the state will be reset to the initial one. +iterateSL' :: (Foldable f, MonadPlus m) + => ArrowState s x (Either e y ) + -> ArrowState s (f x) (Either e (m y)) +iterateSL' a = ArrowState $ \(s,f) -> foldl (a' s) (s,Right mzero) f + where a' s (s',Right m) x = case runArrowState a (s',x) of + (s'',Right m') -> (s'',Right $ mplus m $ return m') + (_ ,Left e ) -> (s ,Left e ) + a' _ e _ = e diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs b/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs new file mode 100644 index 000000000..218a85661 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Arrows/Utils.hs @@ -0,0 +1,495 @@ +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Arrows.Utils + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Utility functions for Arrows (Kleisli monads). + +Some general notes on notation: + +* "^" is meant to stand for a pure function that is lifted into an arrow +based on its usage for that purpose in "Control.Arrow". +* "?" is meant to stand for the usage of a 'FallibleArrow' or a pure function +with an equivalent return value. +* "_" stands for the dropping of a value. +-} + +-- We export everything +module Text.Pandoc.Readers.Odt.Arrows.Utils where + +import Control.Arrow +import Control.Monad ( join, MonadPlus(..) ) + +import qualified Data.Foldable as F +import Data.Monoid + +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.Utils + +and2 :: (Arrow a) => a b c -> a b c' -> a b (c,c') +and2 = (&&&) + +and3 :: (Arrow a) + => a b c0->a b c1->a b c2 + -> a b (c0,c1,c2 ) +and4 :: (Arrow a) + => a b c0->a b c1->a b c2->a b c3 + -> a b (c0,c1,c2,c3 ) +and5 :: (Arrow a) + => a b c0->a b c1->a b c2->a b c3->a b c4 + -> a b (c0,c1,c2,c3,c4 ) +and6 :: (Arrow a) + => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5 + -> a b (c0,c1,c2,c3,c4,c5 ) +and7 :: (Arrow a) + => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5->a b c6 + -> a b (c0,c1,c2,c3,c4,c5,c6 ) +and8 :: (Arrow a) + => a b c0->a b c1->a b c2->a b c3->a b c4->a b c5->a b c6->a b c7 + -> a b (c0,c1,c2,c3,c4,c5,c6,c7) + +and3 a b c = (and2 a b ) &&& c + >>^ \((z,y ) , x) -> (z,y,x ) +and4 a b c d = (and3 a b c ) &&& d + >>^ \((z,y,x ) , w) -> (z,y,x,w ) +and5 a b c d e = (and4 a b c d ) &&& e + >>^ \((z,y,x,w ) , v) -> (z,y,x,w,v ) +and6 a b c d e f = (and5 a b c d e ) &&& f + >>^ \((z,y,x,w,v ) , u) -> (z,y,x,w,v,u ) +and7 a b c d e f g = (and6 a b c d e f ) &&& g + >>^ \((z,y,x,w,v,u ) , t) -> (z,y,x,w,v,u,t ) +and8 a b c d e f g h = (and7 a b c d e f g) &&& h + >>^ \((z,y,x,w,v,u,t) , s) -> (z,y,x,w,v,u,t,s) + +liftA2 :: (Arrow a) => (x -> y -> z) -> a b x -> a b y -> a b z +liftA2 f a b = a &&& b >>^ uncurry f + +liftA3 :: (Arrow a) => (z->y->x -> r) + -> a b z->a b y->a b x + -> a b r +liftA4 :: (Arrow a) => (z->y->x->w -> r) + -> a b z->a b y->a b x->a b w + -> a b r +liftA5 :: (Arrow a) => (z->y->x->w->v -> r) + -> a b z->a b y->a b x->a b w->a b v + -> a b r +liftA6 :: (Arrow a) => (z->y->x->w->v->u -> r) + -> a b z->a b y->a b x->a b w->a b v->a b u + -> a b r +liftA7 :: (Arrow a) => (z->y->x->w->v->u->t -> r) + -> a b z->a b y->a b x->a b w->a b v->a b u->a b t + -> a b r +liftA8 :: (Arrow a) => (z->y->x->w->v->u->t->s -> r) + -> a b z->a b y->a b x->a b w->a b v->a b u->a b t->a b s + -> a b r + +liftA3 fun a b c = and3 a b c >>^ uncurry3 fun +liftA4 fun a b c d = and4 a b c d >>^ uncurry4 fun +liftA5 fun a b c d e = and5 a b c d e >>^ uncurry5 fun +liftA6 fun a b c d e f = and6 a b c d e f >>^ uncurry6 fun +liftA7 fun a b c d e f g = and7 a b c d e f g >>^ uncurry7 fun +liftA8 fun a b c d e f g h = and8 a b c d e f g h >>^ uncurry8 fun + +liftA :: (Arrow a) => (y -> z) -> a b y -> a b z +liftA fun a = a >>^ fun + + +-- | Duplicate a value to subsequently feed it into different arrows. +-- Can almost always be replaced with '(&&&)', 'keepingTheValue', +-- or even '(|||)'. +-- Aequivalent to +-- > returnA &&& returnA +duplicate :: (Arrow a) => a b (b,b) +duplicate = arr $ join (,) + +-- | Lifts the combination of two values into an arrow. +joinOn :: (Arrow a) => (x -> y -> z) -> a (x,y) z +joinOn = arr.uncurry + +-- | Applies a function to the uncurried result-pair of an arrow-application. +-- (The %-symbol was chosen to evoke an association with pairs.) +(>>%) :: (Arrow a) => a x (b,c) -> (b -> c -> d) -> a x d +a >>% f = a >>^ uncurry f + +-- | '(>>%)' with its arguments flipped +(%<<) :: (Arrow a) => (b -> c -> d) -> a x (b,c) -> a x d +(%<<) = flip (>>%) + +-- | Precomposition with an uncurried function +(%>>) :: (Arrow a) => (b -> c -> d) -> a d r -> a (b,c) r +f %>> a = uncurry f ^>> a + +-- | Precomposition with an uncurried function (right to left variant) +(<<%) :: (Arrow a) => a d r -> (b -> c -> d) -> a (b,c) r +(<<%) = flip (%>>) + +infixr 2 >>%, %<<, %>>, <<% + + +-- | Duplicate a value and apply an arrow to the second instance. +-- Aequivalent to +-- > \a -> duplicate >>> second a +-- or +-- > \a -> returnA &&& a +keepingTheValue :: (Arrow a) => a b c -> a b (b,c) +keepingTheValue a = returnA &&& a + +-- | Duplicate a value and apply an arrow to the first instance. +-- Aequivalent to +-- > \a -> duplicate >>> first a +-- or +-- > \a -> a &&& returnA +keepingTheValue' :: (Arrow a) => a b c -> a b (c,b) +keepingTheValue' a = a &&& returnA + +-- | 'bind' from the "Maybe"-Monad lifted into an 'ArrowChoice'. +-- Actually, it's the more complex '(>=>)', because 'bind' alone does not +-- combine as nicely in arrow form. +-- The current implementation is not the most efficient one, because it can +-- not return directly if a 'Nothing' is encountered. That in turn follows +-- from the type system, as 'Nothing' has an "invisible" type parameter that +-- can not be dropped early. +-- +-- Also, there probably is a way to generalize this to other monads +-- or applicatives, but I'm leaving that as an exercise to the reader. +-- I have a feeling there is a new Arrow-typeclass to be found that is less +-- restrictive than 'ArrowApply'. If it is already out there, +-- I have not seen it yet. ('ArrowPlus' for example is not general enough.) +(>>>=) :: (ArrowChoice a) => a x (Maybe b) -> a b (Maybe c) -> a x (Maybe c) +a1 >>>= a2 = a1 >>> maybeToChoice >>> right a2 >>> choiceToMaybe >>^ join + +infixr 2 >>>= + +-- | 'mplus' Lifted into an arrow. No 'ArrowPlus' required. +-- (But still different from a true bind) +(>++<) :: (Arrow a, MonadPlus m) => a x (m b) -> a x (m b) -> a x (m b) +(>++<) = liftA2 mplus + +-- | Left-compose with a pure function +leftLift :: (ArrowChoice a) => (l -> l') -> a (Either l r) (Either l' r) +leftLift = left.arr + +-- | Right-compose with a pure function +rightLift :: (ArrowChoice a) => (r -> r') -> a (Either l r) (Either l r') +rightLift = right.arr + + +( ^+++ ) :: (ArrowChoice a) => (b -> c) -> a b' c' -> a (Either b b') (Either c c') +( +++^ ) :: (ArrowChoice a) => a b c -> (b' -> c') -> a (Either b b') (Either c c') +( ^+++^ ) :: (ArrowChoice a) => (b -> c) -> (b' -> c') -> a (Either b b') (Either c c') + +l ^+++ r = leftLift l >>> right r +l +++^ r = left l >>> rightLift r +l ^+++^ r = leftLift l >>> rightLift r + +infixr 2 ^+++, +++^, ^+++^ + +( ^||| ) :: (ArrowChoice a) => (b -> d) -> a c d -> a (Either b c) d +( |||^ ) :: (ArrowChoice a) => a b d -> (c -> d) -> a (Either b c) d +( ^|||^ ) :: (ArrowChoice a) => (b -> d) -> (c -> d) -> a (Either b c) d + +l ^||| r = arr l ||| r +l |||^ r = l ||| arr r +l ^|||^ r = arr l ||| arr r + +infixr 2 ^||| , |||^, ^|||^ + +( ^&&& ) :: (Arrow a) => (b -> c) -> a b c' -> a b (c,c') +( &&&^ ) :: (Arrow a) => a b c -> (b -> c') -> a b (c,c') +( ^&&&^ ) :: (Arrow a) => (b -> c) -> (b -> c') -> a b (c,c') + +l ^&&& r = arr l &&& r +l &&&^ r = l &&& arr r +l ^&&&^ r = arr l &&& arr r + +infixr 3 ^&&&, &&&^, ^&&&^ + +( ^*** ) :: (Arrow a) => (b -> c) -> a b' c' -> a (b,b') (c,c') +( ***^ ) :: (Arrow a) => a b c -> (b' -> c') -> a (b,b') (c,c') +( ^***^ ) :: (Arrow a) => (b -> c) -> (b' -> c') -> a (b,b') (c,c') + +l ^*** r = arr l *** r +l ***^ r = l *** arr r +l ^***^ r = arr l *** arr r + +infixr 3 ^***, ***^, ^***^ + +-- | A version of +-- +-- >>> \p -> arr (\x -> if p x the Right x else Left x) +-- +-- but with p being an arrow +choose :: (ArrowChoice a) => a b Bool -> a b (Either b b) +choose checkValue = keepingTheValue checkValue >>^ select + where select (x,True ) = Right x + select (x,False ) = Left x + +-- | Converts @Right a@ into @Just a@ and @Left _@ into @Nothing@. +choiceToMaybe :: (ArrowChoice a) => a (Either l r) (Maybe r) +choiceToMaybe = arr eitherToMaybe + +-- | Converts @Nothing@ into @Left ()@ and @Just a@ into @Right a@. +maybeToChoice :: (ArrowChoice a) => a (Maybe b) (Fallible b) +maybeToChoice = arr maybeToEither + +-- | Lifts a constant value into an arrow +returnV :: (Arrow a) => c -> a x c +returnV = arr.const + +-- | 'returnA' dropping everything +returnA_ :: (Arrow a) => a _b () +returnA_ = returnV () + +-- | Wrapper for an arrow that can be evaluated im parallel. All +-- Arrows can be evaluated in parallel, as long as they return a +-- monoid. +newtype ParallelArrow a b c = CoEval { evalParallelArrow :: a b c } + deriving (Eq, Ord, Show) + +instance (Arrow a, Monoid m) => Monoid (ParallelArrow a b m) where + mempty = CoEval $ returnV mempty + (CoEval a) `mappend` (CoEval ~b) = CoEval $ a &&& b >>% mappend + +-- | Evaluates a collection of arrows in a parallel fashion. +-- +-- This is in essence a fold of '(&&&)' over the collection, +-- so the actual execution order and parallelity depends on the +-- implementation of '(&&&)' in the arrow in question. +-- The default implementation of '(&&&)' for example keeps the +-- order as given in the collection. +-- +-- This function can be seen as a generalization of +-- 'Control.Applicative.sequenceA' to arrows or as an alternative to +-- a fold with 'Control.Applicative.WrappedArrow', which +-- substitutes the monoid with function application. +-- +coEval :: (Arrow a, F.Foldable f, Monoid m) => f (a b m) -> a b m +coEval = evalParallelArrow . (F.foldMap CoEval) + +-- | Defines Left as failure, Right as success +type FallibleArrow a input failure success = a input (Either failure success) + +type ReFallibleArrow a failure success success' + = FallibleArrow a (Either failure success) failure success' + +-- | Wrapper for fallible arrows. Fallible arrows are all arrows that return +-- an Either value where left is a faliure and right is a success value. +newtype AlternativeArrow a input failure success + = TryArrow { evalAlternativeArrow :: FallibleArrow a input failure success } + + +instance (ArrowChoice a, Monoid failure) + => Monoid (AlternativeArrow a input failure success) where + mempty = TryArrow $ returnV $ Left mempty + (TryArrow a) `mappend` (TryArrow b) + = TryArrow $ a &&& b + >>^ \(a',~b') + -> ( (\a'' -> left (mappend a'') b') ||| Right ) + a' + +-- | Evaluates a collection of fallible arrows, trying each one in succession. +-- Left values are interpreted as failures, right values as successes. +-- +-- The evaluation is stopped once an arrow succeeds. +-- Up to that point, all failures are collected in the failure-monoid. +-- Note that '()' is a monoid, and thus can serve as a failure-collector if +-- you are uninterested in the exact failures. +-- +-- This is in essence a fold of '(&&&)' over the collection, enhanced with a +-- little bit of repackaging, so the actual execution order depends on the +-- implementation of '(&&&)' in the arrow in question. +-- The default implementation of '(&&&)' for example keeps the +-- order as given in the collection. +-- +tryArrows :: (ArrowChoice a, F.Foldable f, Monoid failure) + => f (FallibleArrow a b failure success) + -> FallibleArrow a b failure success +tryArrows = evalAlternativeArrow . (F.foldMap TryArrow) + +-- +liftSuccess :: (ArrowChoice a) + => (success -> success') + -> ReFallibleArrow a failure success success' +liftSuccess = rightLift + +-- +liftAsSuccess :: (ArrowChoice a) + => a x success + -> FallibleArrow a x failure success +liftAsSuccess a = a >>^ Right + +-- +asFallibleArrow :: (ArrowChoice a) + => a x success + -> FallibleArrow a x failure success +asFallibleArrow a = a >>^ Right + +-- | Raises an error into a 'ReFallibleArrow' if the arrow is already in +-- "error mode" +liftError :: (ArrowChoice a, Monoid failure) + => failure + -> ReFallibleArrow a failure success success +liftError e = leftLift (e <>) + +-- | Raises an error into a 'FallibleArrow', droping both the arrow input +-- and any previously stored error value. +_raiseA :: (ArrowChoice a) + => failure + -> FallibleArrow a x failure success +_raiseA e = returnV (Left e) + +-- | Raises an empty error into a 'FallibleArrow', droping both the arrow input +-- and any previously stored error value. +_raiseAEmpty :: (ArrowChoice a, Monoid failure) + => FallibleArrow a x failure success +_raiseAEmpty = _raiseA mempty + +-- | Raises an error into a 'ReFallibleArrow', possibly appending the new error +-- to an existing one +raiseA :: (ArrowChoice a, Monoid failure) + => failure + -> ReFallibleArrow a failure success success +raiseA e = arr $ Left.(either (<> e) (const e)) + +-- | Raises an empty error into a 'ReFallibleArrow'. If there already is an +-- error, nothing changes. +-- (Note that this function is only aequivalent to @raiseA mempty@ iff the +-- failure monoid follows the monoid laws.) +raiseAEmpty :: (ArrowChoice a, Monoid failure) + => ReFallibleArrow a failure success success +raiseAEmpty = arr (fromRight (const mempty) >>> Left) + + +-- | Execute the second arrow if the first succeeds +(>>?) :: (ArrowChoice a) + => FallibleArrow a x failure success + -> FallibleArrow a success failure success' + -> FallibleArrow a x failure success' +a >>? b = a >>> Left ^||| b + +-- | Execute the lifted second arrow if the first succeeds +(>>?^) :: (ArrowChoice a) + => FallibleArrow a x failure success + -> (success -> success') + -> FallibleArrow a x failure success' +a >>?^ f = a >>^ Left ^|||^ Right . f + +-- | Execute the lifted second arrow if the first succeeds +(>>?^?) :: (ArrowChoice a) + => FallibleArrow a x failure success + -> (success -> Either failure success') + -> FallibleArrow a x failure success' +a >>?^? b = a >>> Left ^|||^ b + +-- | Execute the second arrow if the lifted first arrow succeeds +(^>>?) :: (ArrowChoice a) + => (x -> Either failure success) + -> FallibleArrow a success failure success' + -> FallibleArrow a x failure success' +a ^>>? b = a ^>> Left ^||| b + +-- | Execute the lifted second arrow if the lifted first arrow succeeds +(^>>?^) :: (ArrowChoice a) + => (x -> Either failure success) + -> (success -> success') + -> FallibleArrow a x failure success' +a ^>>?^ f = arr $ a >>> right f + +-- | Execute the lifted second arrow if the lifted first arrow succeeds +(^>>?^?) :: (ArrowChoice a) + => (x -> Either failure success) + -> (success -> Either failure success') + -> FallibleArrow a x failure success' +a ^>>?^? f = a ^>> Left ^|||^ f + +-- | Execute the second, non-fallible arrow if the first arrow succeeds +(>>?!) :: (ArrowChoice a) + => FallibleArrow a x failure success + -> a success success' + -> FallibleArrow a x failure success' +a >>?! f = a >>> right f + +--- +(>>?%) :: (ArrowChoice a) + => FallibleArrow a x f (b,b') + -> (b -> b' -> c) + -> FallibleArrow a x f c +a >>?% f = a >>?^ (uncurry f) + +--- +(^>>?%) :: (ArrowChoice a) + => (x -> Either f (b,b')) + -> (b -> b' -> c) + -> FallibleArrow a x f c +a ^>>?% f = arr a >>?^ (uncurry f) + +--- +(>>?%?) :: (ArrowChoice a) + => FallibleArrow a x f (b,b') + -> (b -> b' -> (Either f c)) + -> FallibleArrow a x f c +a >>?%? f = a >>?^? (uncurry f) + +infixr 1 >>?, >>?^, >>?^? +infixr 1 ^>>?, ^>>?^, ^>>?^?, >>?! +infixr 1 >>?%, ^>>?%, >>?%? + +-- | Keep values that are Right, replace Left values by a constant. +ifFailedUse :: (ArrowChoice a) => v -> a (Either f v) v +ifFailedUse v = arr $ either (const v) id + +-- | '(&&)' lifted into an arrow +(<&&>) :: (Arrow a) => a x Bool -> a x Bool -> a x Bool +(<&&>) = liftA2 (&&) + +-- | '(||)' lifted into an arrow +(<||>) :: (Arrow a) => a x Bool -> a x Bool -> a x Bool +(<||>) = liftA2 (||) + +-- | An equivalent of '(&&)' in a fallible arrow +(>&&<) :: (ArrowChoice a, Monoid f) => FallibleArrow a x f s + -> FallibleArrow a x f s' + -> FallibleArrow a x f (s,s') +(>&&<) = liftA2 chooseMin + +-- | An equivalent of '(||)' in some forms of fallible arrows +(>||<) :: (ArrowChoice a, Monoid f, Monoid s) => FallibleArrow a x f s + -> FallibleArrow a x f s + -> FallibleArrow a x f s +(>||<) = liftA2 chooseMax + +-- | An arrow version of a short-circuit (<|>) +ifFailedDo :: (ArrowChoice a) + => FallibleArrow a x f y + -> FallibleArrow a x f y + -> FallibleArrow a x f y +ifFailedDo a b = keepingTheValue a >>> repackage ^>> (b |||^ Right) + where repackage (x , Left _) = Left x + repackage (_ , Right y) = Right y + +infixr 4 <&&>, <||>, >&&<, >||< +infixr 1 `ifFailedDo` + + diff --git a/src/Text/Pandoc/Readers/Odt/Base.hs b/src/Text/Pandoc/Readers/Odt/Base.hs new file mode 100644 index 000000000..1f095bade --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Base.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE PatternGuards #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Base + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Core types of the odt reader. +-} + +module Text.Pandoc.Readers.Odt.Base where + +import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Readers.Odt.Namespaces + +type OdtConverterState s = XMLConverterState Namespace s + +type XMLReader s a b = FallibleXMLConverter Namespace s a b + +type XMLReaderSafe s a b = XMLConverter Namespace s a b + diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs new file mode 100644 index 000000000..a1bd8cb59 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -0,0 +1,929 @@ +{-# LANGUAGE Arrows #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE RecordWildCards #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.ContentReader + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +The core of the odt reader that converts odt features into Pandoc types. +-} + +module Text.Pandoc.Readers.Odt.ContentReader +( readerState +, read_body +) where + +import Control.Arrow +import Control.Applicative hiding ( liftA, liftA2, liftA3 ) + +import qualified Data.ByteString.Lazy as B +import qualified Data.Map as M +import Data.List ( find, intercalate ) +import Data.Maybe + +import qualified Text.XML.Light as XML + +import Text.Pandoc.Definition +import Text.Pandoc.Builder +import Text.Pandoc.MediaBag (insertMedia, MediaBag) +import Text.Pandoc.Shared + +import Text.Pandoc.Readers.Odt.Base +import Text.Pandoc.Readers.Odt.Namespaces +import Text.Pandoc.Readers.Odt.StyleReader + +import Text.Pandoc.Readers.Odt.Arrows.Utils +import Text.Pandoc.Readers.Odt.Generic.XMLConverter +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.Utils + +import qualified Data.Set as Set + +-------------------------------------------------------------------------------- +-- State +-------------------------------------------------------------------------------- + +type Anchor = String +type Media = [(FilePath, B.ByteString)] + +data ReaderState + = ReaderState { -- | A collection of styles read somewhere else. + -- It is only queried here, not modified. + styleSet :: Styles + -- | A stack of the styles of parent elements. + -- Used to look up inherited style properties. + , styleTrace :: [Style] + -- | Keeps track of the current depth in nested lists + , currentListLevel :: ListLevel + -- | Lists may provide their own style, but they don't have + -- to. If they do not, the style of a parent list may be used + -- or even a default list style from the paragraph style. + -- This value keeps track of the closest list style there + -- currently is. + , currentListStyle :: Maybe ListStyle + -- | A map from internal anchor names to "pretty" ones. + -- The mapping is a purely cosmetic one. + , bookmarkAnchors :: M.Map Anchor Anchor + -- | A map of files / binary data from the archive + , envMedia :: Media + -- | Hold binary resources used in the document + , odtMediaBag :: MediaBag +-- , sequences +-- , trackedChangeIDs + } + deriving ( Show ) + +readerState :: Styles -> Media -> ReaderState +readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty + +-- +pushStyle' :: Style -> ReaderState -> ReaderState +pushStyle' style state = state { styleTrace = style : styleTrace state } + +-- +popStyle' :: ReaderState -> ReaderState +popStyle' state = case styleTrace state of + _:trace -> state { styleTrace = trace } + _ -> state + +-- +modifyListLevel :: (ListLevel -> ListLevel) -> (ReaderState -> ReaderState) +modifyListLevel f state = state { currentListLevel = f (currentListLevel state) } + +-- +shiftListLevel :: ListLevel -> (ReaderState -> ReaderState) +shiftListLevel diff = modifyListLevel (+ diff) + +-- +swapCurrentListStyle :: Maybe ListStyle -> ReaderState + -> (ReaderState, Maybe ListStyle) +swapCurrentListStyle mListStyle state = ( state { currentListStyle = mListStyle } + , currentListStyle state + ) + +-- +lookupPrettyAnchor :: Anchor -> ReaderState -> Maybe Anchor +lookupPrettyAnchor anchor ReaderState{..} = M.lookup anchor bookmarkAnchors + +-- +putPrettyAnchor :: Anchor -> Anchor -> ReaderState -> ReaderState +putPrettyAnchor ugly pretty state@ReaderState{..} + = state { bookmarkAnchors = M.insert ugly pretty bookmarkAnchors } + +-- +usedAnchors :: ReaderState -> [Anchor] +usedAnchors ReaderState{..} = M.elems bookmarkAnchors + +getMediaBag :: ReaderState -> MediaBag +getMediaBag ReaderState{..} = odtMediaBag + +getMediaEnv :: ReaderState -> Media +getMediaEnv ReaderState{..} = envMedia + +insertMedia' :: (FilePath, B.ByteString) -> ReaderState -> ReaderState +insertMedia' (fp, bs) state@ReaderState{..} + = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag } + +-------------------------------------------------------------------------------- +-- Reader type and associated tools +-------------------------------------------------------------------------------- + +type OdtReader a b = XMLReader ReaderState a b + +type OdtReaderSafe a b = XMLReaderSafe ReaderState a b + +-- | Extract something from the styles +fromStyles :: (a -> Styles -> b) -> OdtReaderSafe a b +fromStyles f = keepingTheValue + (getExtraState >>^ styleSet) + >>% f + +-- +getStyleByName :: OdtReader StyleName Style +getStyleByName = fromStyles lookupStyle >>^ maybeToChoice + +-- +findStyleFamily :: OdtReader Style StyleFamily +findStyleFamily = fromStyles getStyleFamily >>^ maybeToChoice + +-- +lookupListStyle :: OdtReader StyleName ListStyle +lookupListStyle = fromStyles lookupListStyleByName >>^ maybeToChoice + +-- +switchCurrentListStyle :: OdtReaderSafe (Maybe ListStyle) (Maybe ListStyle) +switchCurrentListStyle = keepingTheValue getExtraState + >>% swapCurrentListStyle + >>> first setExtraState + >>^ snd + +-- +pushStyle :: OdtReaderSafe Style Style +pushStyle = keepingTheValue ( + ( keepingTheValue getExtraState + >>% pushStyle' + ) + >>> setExtraState + ) + >>^ fst + +-- +popStyle :: OdtReaderSafe x x +popStyle = keepingTheValue ( + getExtraState + >>> arr popStyle' + >>> setExtraState + ) + >>^ fst + +-- +getCurrentListLevel :: OdtReaderSafe _x ListLevel +getCurrentListLevel = getExtraState >>^ currentListLevel + +-- +updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString) +updateMediaWithResource = keepingTheValue ( + (keepingTheValue getExtraState + >>% insertMedia' + ) + >>> setExtraState + ) + >>^ fst + +lookupResource :: OdtReaderSafe String (FilePath, B.ByteString) +lookupResource = proc target -> do + state <- getExtraState -< () + case lookup target (getMediaEnv state) of + Just bs -> returnV (target, bs) -<< () + Nothing -> returnV ("", B.empty) -< () + +type AnchorPrefix = String + +-- | An adaptation of 'uniqueIdent' from "Text.Pandoc.Shared" that generates a +-- unique identifier but without assuming that the id should be for a header. +-- Second argument is a list of already used identifiers. +uniqueIdentFrom :: AnchorPrefix -> [Anchor] -> Anchor +uniqueIdentFrom baseIdent usedIdents = + let numIdent n = baseIdent ++ "-" ++ show n + in if baseIdent `elem` usedIdents + then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of + Just x -> numIdent x + Nothing -> baseIdent -- if we have more than 60,000, allow repeats + else baseIdent + +-- | First argument: basis for a new "pretty" anchor if none exists yet +-- Second argument: a key ("ugly" anchor) +-- Returns: saved "pretty" anchor or created new one +getPrettyAnchor :: OdtReaderSafe (AnchorPrefix, Anchor) Anchor +getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do + state <- getExtraState -< () + case lookupPrettyAnchor uglyAnchor state of + Just prettyAnchor -> returnA -< prettyAnchor + Nothing -> do + let newPretty = uniqueIdentFrom baseIdent (usedAnchors state) + modifyExtraState (putPrettyAnchor uglyAnchor newPretty) -<< newPretty + +-- | Input: basis for a new header anchor +-- Ouput: saved new anchor +getHeaderAnchor :: OdtReaderSafe Inlines Anchor +getHeaderAnchor = proc title -> do + state <- getExtraState -< () + let anchor = uniqueIdent (toList title) (Set.fromList $ usedAnchors state) + modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor + + +-------------------------------------------------------------------------------- +-- Working with styles +-------------------------------------------------------------------------------- + +-- +readStyleByName :: OdtReader _x (StyleName, Style) +readStyleByName = + findAttr NsText "style-name" >>? keepingTheValue getStyleByName >>^ liftE + where + liftE :: (StyleName, Fallible Style) -> Fallible (StyleName, Style) + liftE (name, Right v) = Right (name, v) + liftE (_, Left v) = Left v + +-- +isStyleToTrace :: OdtReader Style Bool +isStyleToTrace = findStyleFamily >>?^ (==FaText) + +-- +withNewStyle :: OdtReaderSafe x Inlines -> OdtReaderSafe x Inlines +withNewStyle a = proc x -> do + fStyle <- readStyleByName -< () + case fStyle of + Right (styleName, _) | isCodeStyle styleName -> do + inlines <- a -< x + arr inlineCode -<< inlines + Right (_, style) -> do + mFamily <- arr styleFamily -< style + fTextProps <- arr ( maybeToChoice + . textProperties + . styleProperties + ) -< style + case fTextProps of + Right textProps -> do + state <- getExtraState -< () + let triple = (state, textProps, mFamily) + modifier <- arr modifierFromStyleDiff -< triple + fShouldTrace <- isStyleToTrace -< style + case fShouldTrace of + Right shouldTrace -> do + if shouldTrace + then do + pushStyle -< style + inlines <- a -< x + popStyle -< () + arr modifier -<< inlines + else + -- In case anything goes wrong + a -< x + Left _ -> a -< x + Left _ -> a -< x + Left _ -> a -< x + where + isCodeStyle :: StyleName -> Bool + isCodeStyle "Source_Text" = True + isCodeStyle _ = False + + inlineCode :: Inlines -> Inlines + inlineCode = code . intercalate "" . map stringify . toList + +type PropertyTriple = (ReaderState, TextProperties, Maybe StyleFamily) +type InlineModifier = Inlines -> Inlines + +-- | Given data about the local style changes, calculates how to modify +-- an instance of 'Inlines' +modifierFromStyleDiff :: PropertyTriple -> InlineModifier +modifierFromStyleDiff propertyTriple = + composition $ + (getVPosModifier propertyTriple) + : map (first ($ propertyTriple) >>> ifThen_else ignore) + [ (hasEmphChanged , emph ) + , (hasChanged isStrong , strong ) + , (hasChanged strikethrough , strikeout ) + ] + where + ifThen_else else' (if',then') = if if' then then' else else' + + ignore = id :: InlineModifier + + getVPosModifier :: PropertyTriple -> InlineModifier + getVPosModifier triple@(_,textProps,_) = + let getVPos = Just . verticalPosition + in case lookupPreviousValueM getVPos triple of + Nothing -> ignore + Just oldVPos -> getVPosModifier' (oldVPos, verticalPosition textProps) + + getVPosModifier' (oldVPos , newVPos ) | oldVPos == newVPos = ignore + getVPosModifier' ( _ , VPosSub ) = subscript + getVPosModifier' ( _ , VPosSuper ) = superscript + getVPosModifier' ( _ , _ ) = ignore + + hasEmphChanged :: PropertyTriple -> Bool + hasEmphChanged = swing any [ hasChanged isEmphasised + , hasChangedM pitch + , hasChanged underline + ] + + hasChanged property triple@(_, property -> newProperty, _) = + maybe True (/=newProperty) (lookupPreviousValue property triple) + + hasChangedM property triple@(_, textProps,_) = + fromMaybe False $ (/=) <$> property textProps <*> lookupPreviousValueM property triple + + lookupPreviousValue f = lookupPreviousStyleValue ((fmap f).textProperties) + + lookupPreviousValueM f = lookupPreviousStyleValue ((f =<<).textProperties) + + lookupPreviousStyleValue f (ReaderState{..},_,mFamily) + = ( findBy f $ extendedStylePropertyChain styleTrace styleSet ) + <|> ( f =<< fmap (lookupDefaultStyle' styleSet) mFamily ) + + +type ParaModifier = Blocks -> Blocks + +_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ :: Int +_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ :: Int +_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ = 5 +_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ = 5 + +-- | Returns either 'id' or 'blockQuote' depending on the current indentation +getParaModifier :: Style -> ParaModifier +getParaModifier Style{..} | Just props <- paraProperties styleProperties + , isBlockQuote (indentation props) + (margin_left props) + = blockQuote + | otherwise + = id + where + isBlockQuote mIndent mMargin + | LengthValueMM indent <- mIndent + , indent > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ + = True + | LengthValueMM margin <- mMargin + , margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ + = True + | LengthValueMM indent <- mIndent + , LengthValueMM margin <- mMargin + = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ + + | PercentValue indent <- mIndent + , indent > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ + = True + | PercentValue margin <- mMargin + , margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ + = True + | PercentValue indent <- mIndent + , PercentValue margin <- mMargin + = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ + + | otherwise + = False + +-- +constructPara :: OdtReaderSafe Blocks Blocks -> OdtReaderSafe Blocks Blocks +constructPara reader = proc blocks -> do + fStyle <- readStyleByName -< blocks + case fStyle of + Left _ -> reader -< blocks + Right (styleName, _) | isTableCaptionStyle styleName -> do + blocks' <- reader -< blocks + arr tableCaptionP -< blocks' + Right (_, style) -> do + let modifier = getParaModifier style + blocks' <- reader -< blocks + arr modifier -<< blocks' + where + isTableCaptionStyle :: StyleName -> Bool + isTableCaptionStyle "Table" = True + isTableCaptionStyle _ = False + tableCaptionP b = divWith ("", ["caption"], []) b + +type ListConstructor = [Blocks] -> Blocks + +getListConstructor :: ListLevelStyle -> ListConstructor +getListConstructor ListLevelStyle{..} = + case listLevelType of + LltBullet -> bulletList + LltImage -> bulletList + LltNumbered -> let listNumberStyle = toListNumberStyle listItemFormat + listNumberDelim = toListNumberDelim listItemPrefix + listItemSuffix + in orderedListWith (listItemStart, listNumberStyle, listNumberDelim) + where + toListNumberStyle LinfNone = DefaultStyle + toListNumberStyle LinfNumber = Decimal + toListNumberStyle LinfRomanLC = LowerRoman + toListNumberStyle LinfRomanUC = UpperRoman + toListNumberStyle LinfAlphaLC = LowerAlpha + toListNumberStyle LinfAlphaUC = UpperAlpha + toListNumberStyle (LinfString _) = Example + + toListNumberDelim Nothing (Just ".") = Period + toListNumberDelim (Just "" ) (Just ".") = Period + toListNumberDelim Nothing (Just ")") = OneParen + toListNumberDelim (Just "" ) (Just ")") = OneParen + toListNumberDelim (Just "(") (Just ")") = TwoParens + toListNumberDelim _ _ = DefaultDelim + + +-- | Determines which style to use for a list, which level to use of that +-- style, and which type of list to create as a result of this information. +-- Then prepares the state for eventual child lists and constructs the list from +-- the results. +-- Two main cases are handled: The list may provide its own style or it may +-- rely on a parent list's style. I the former case the current style in the +-- state must be switched before and after the call to the child converter +-- while in the latter the child converter can be called directly. +-- If anything goes wrong, a default ordered-list-constructor is used. +constructList :: OdtReaderSafe x [Blocks] -> OdtReaderSafe x Blocks +constructList reader = proc x -> do + modifyExtraState (shiftListLevel 1) -< () + listLevel <- getCurrentListLevel -< () + fStyleName <- findAttr NsText "style-name" -< () + case fStyleName of + Right styleName -> do + fListStyle <- lookupListStyle -< styleName + case fListStyle of + Right listStyle -> do + fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle) + case fLLS of + Just listLevelStyle -> do + oldListStyle <- switchCurrentListStyle -< Just listStyle + blocks <- constructListWith listLevelStyle -<< x + switchCurrentListStyle -< oldListStyle + returnA -< blocks + Nothing -> constructOrderedList -< x + Left _ -> constructOrderedList -< x + Left _ -> do + state <- getExtraState -< () + mListStyle <- arr currentListStyle -< state + case mListStyle of + Just listStyle -> do + fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle) + case fLLS of + Just listLevelStyle -> constructListWith listLevelStyle -<< x + Nothing -> constructOrderedList -< x + Nothing -> constructOrderedList -< x + where + constructOrderedList = + reader + >>> modifyExtraState (shiftListLevel (-1)) + >>^ orderedList + constructListWith listLevelStyle = + reader + >>> getListConstructor listLevelStyle + ^>> modifyExtraState (shiftListLevel (-1)) + +-------------------------------------------------------------------------------- +-- Readers +-------------------------------------------------------------------------------- + +type ElementMatcher result = (Namespace, ElementName, OdtReader result result) + +type InlineMatcher = ElementMatcher Inlines + +type BlockMatcher = ElementMatcher Blocks + + +-- +matchingElement :: (Monoid e) + => Namespace -> ElementName + -> OdtReaderSafe e e + -> ElementMatcher e +matchingElement ns name reader = (ns, name, asResultAccumulator reader) + where + asResultAccumulator :: (ArrowChoice a, Monoid m) => a m m -> a m (Fallible m) + asResultAccumulator a = liftAsSuccess $ keepingTheValue a >>% (<>) + +-- +matchChildContent' :: (Monoid result) + => [ElementMatcher result] + -> OdtReaderSafe _x result +matchChildContent' ls = returnV mempty >>> matchContent' ls + +-- +matchChildContent :: (Monoid result) + => [ElementMatcher result] + -> OdtReaderSafe (result, XML.Content) result + -> OdtReaderSafe _x result +matchChildContent ls fallback = returnV mempty >>> matchContent ls fallback + + +-------------------------------------------- +-- Matchers +-------------------------------------------- + +---------------------- +-- Basics +---------------------- + +-- +-- | Open Document allows several consecutive spaces if they are marked up +read_plain_text :: OdtReaderSafe (Inlines, XML.Content) Inlines +read_plain_text = fst ^&&& read_plain_text' >>% recover + where + -- fallible version + read_plain_text' :: OdtReader (Inlines, XML.Content) Inlines + read_plain_text' = ( second ( arr extractText ) + >>^ spreadChoice >>?! second text + ) + >>?% (<>) + -- + extractText :: XML.Content -> Fallible String + extractText (XML.Text cData) = succeedWith (XML.cdData cData) + extractText _ = failEmpty + +read_text_seq :: InlineMatcher +read_text_seq = matchingElement NsText "sequence" + $ matchChildContent [] read_plain_text + + +-- specifically. I honor that, although the current implementation of '(<>)' +-- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein. +-- The rational is to be prepared for future modifications. +read_spaces :: InlineMatcher +read_spaces = matchingElement NsText "s" ( + readAttrWithDefault NsText "c" 1 -- how many spaces? + >>^ fromList.(`replicate` Space) + ) +-- +read_line_break :: InlineMatcher +read_line_break = matchingElement NsText "line-break" + $ returnV linebreak + +-- +read_span :: InlineMatcher +read_span = matchingElement NsText "span" + $ withNewStyle + $ matchChildContent [ read_span + , read_spaces + , read_line_break + , read_link + , read_note + , read_citation + , read_bookmark + , read_bookmark_start + , read_reference_start + , read_bookmark_ref + , read_reference_ref + ] read_plain_text + +-- +read_paragraph :: BlockMatcher +read_paragraph = matchingElement NsText "p" + $ constructPara + $ liftA para + $ withNewStyle + $ matchChildContent [ read_span + , read_spaces + , read_line_break + , read_link + , read_note + , read_citation + , read_bookmark + , read_bookmark_start + , read_reference_start + , read_bookmark_ref + , read_reference_ref + , read_maybe_nested_img_frame + , read_text_seq + ] read_plain_text + + +---------------------- +-- Headers +---------------------- + +-- +read_header :: BlockMatcher +read_header = matchingElement NsText "h" + $ proc blocks -> do + level <- ( readAttrWithDefault NsText "outline-level" 1 + ) -< blocks + children <- ( matchChildContent [ read_span + , read_spaces + , read_line_break + , read_link + , read_note + , read_citation + , read_bookmark + , read_bookmark_start + , read_reference_start + , read_bookmark_ref + , read_reference_ref + , read_maybe_nested_img_frame + ] read_plain_text + ) -< blocks + anchor <- getHeaderAnchor -< children + let idAttr = (anchor, [], []) -- no classes, no key-value pairs + arr (uncurry3 headerWith) -< (idAttr, level, children) + +---------------------- +-- Lists +---------------------- + +-- +read_list :: BlockMatcher +read_list = matchingElement NsText "list" +-- $ withIncreasedListLevel + $ constructList +-- $ liftA bulletList + $ matchChildContent' [ read_list_item + ] +-- +read_list_item :: ElementMatcher [Blocks] +read_list_item = matchingElement NsText "list-item" + $ liftA (compactify.(:[])) + ( matchChildContent' [ read_paragraph + , read_header + , read_list + ] + ) + + +---------------------- +-- Links +---------------------- + +read_link :: InlineMatcher +read_link = matchingElement NsText "a" + $ liftA3 link + ( findAttrWithDefault NsXLink "href" "" ) + ( findAttrWithDefault NsOffice "title" "" ) + ( matchChildContent [ read_span + , read_note + , read_citation + , read_bookmark + , read_bookmark_start + , read_reference_start + , read_bookmark_ref + , read_reference_ref + ] read_plain_text ) + + +------------------------- +-- Footnotes +------------------------- + +read_note :: InlineMatcher +read_note = matchingElement NsText "note" + $ liftA note + $ matchChildContent' [ read_note_body ] + +read_note_body :: BlockMatcher +read_note_body = matchingElement NsText "note-body" + $ matchChildContent' [ read_paragraph ] + +------------------------- +-- Citations +------------------------- + +read_citation :: InlineMatcher +read_citation = matchingElement NsText "bibliography-mark" + $ liftA2 cite + ( liftA2 makeCitation + ( findAttrWithDefault NsText "identifier" "" ) + ( readAttrWithDefault NsText "number" 0 ) + ) + ( matchChildContent [] read_plain_text ) + where + makeCitation :: String -> Int -> [Citation] + makeCitation citeId num = [Citation citeId [] [] NormalCitation num 0] + + +---------------------- +-- Tables +---------------------- + +-- +read_table :: BlockMatcher +read_table = matchingElement NsTable "table" + $ liftA simpleTable' + $ matchChildContent' [ read_table_row + ] + +-- | A simple table without a caption or headers +-- | Infers the number of headers from rows +simpleTable' :: [[Blocks]] -> Blocks +simpleTable' [] = simpleTable [] [] +simpleTable' (x : rest) = simpleTable (fmap (const defaults) x) (x : rest) + where defaults = fromList [] + +-- +read_table_row :: ElementMatcher [[Blocks]] +read_table_row = matchingElement NsTable "table-row" + $ liftA (:[]) + $ matchChildContent' [ read_table_cell + ] + +-- +read_table_cell :: ElementMatcher [Blocks] +read_table_cell = matchingElement NsTable "table-cell" + $ liftA (compactify.(:[])) + $ matchChildContent' [ read_paragraph + ] + +---------------------- +-- Images +---------------------- + +-- +read_maybe_nested_img_frame :: InlineMatcher +read_maybe_nested_img_frame = matchingElement NsDraw "frame" + $ proc blocks -> do + img <- (findChild' NsDraw "image") -< () + case img of + Just _ -> read_frame -< blocks + Nothing -> matchChildContent' [ read_frame_text_box ] -< blocks + +read_frame :: OdtReaderSafe Inlines Inlines +read_frame = + proc blocks -> do + w <- ( findAttr' NsSVG "width" ) -< () + h <- ( findAttr' NsSVG "height" ) -< () + titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks + src <- matchChildContent' [ read_image_src ] -< blocks + resource <- lookupResource -< src + _ <- updateMediaWithResource -< resource + alt <- (matchChildContent [] read_plain_text) -< blocks + arr (uncurry4 imageWith ) -< + (image_attributes w h, src, inlineListToIdentifier (toList titleNodes), alt) + +image_attributes :: Maybe String -> Maybe String -> Attr +image_attributes x y = + ( "", [], (dim "width" x) ++ (dim "height" y)) + where + dim _ (Just "") = [] + dim name (Just v) = [(name, v)] + dim _ Nothing = [] + +read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor) +read_image_src = matchingElement NsDraw "image" + $ proc _ -> do + imgSrc <- findAttr NsXLink "href" -< () + case imgSrc of + Right src -> returnV src -<< () + Left _ -> returnV "" -< () + +read_frame_title :: InlineMatcher +read_frame_title = matchingElement NsSVG "title" + $ (matchChildContent [] read_plain_text) + +read_frame_text_box :: InlineMatcher +read_frame_text_box = matchingElement NsDraw "text-box" + $ proc blocks -> do + paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks + arr read_img_with_caption -< toList paragraphs + +read_img_with_caption :: [Block] -> Inlines +read_img_with_caption ((Para ((Image attr alt (src,title)) : [])) : _) = + singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption +read_img_with_caption ((Para ((Image attr _ (src,title)) : txt)) : _) = + singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows +read_img_with_caption ( (Para (_ : xs)) : ys) = + read_img_with_caption ((Para xs) : ys) +read_img_with_caption _ = + mempty + +---------------------- +-- Internal links +---------------------- + +_ANCHOR_PREFIX_ :: String +_ANCHOR_PREFIX_ = "anchor" + +-- +readAnchorAttr :: OdtReader _x Anchor +readAnchorAttr = findAttr NsText "name" + +-- | Beware: may fail +findAnchorName :: OdtReader AnchorPrefix Anchor +findAnchorName = ( keepingTheValue readAnchorAttr + >>^ spreadChoice + ) >>?! getPrettyAnchor + + +-- +maybeAddAnchorFrom :: OdtReader Inlines AnchorPrefix + -> OdtReaderSafe Inlines Inlines +maybeAddAnchorFrom anchorReader = + keepingTheValue (anchorReader >>? findAnchorName >>?^ toAnchorElem) + >>> + proc (inlines, fAnchorElem) -> do + case fAnchorElem of + Right anchorElem -> returnA -< anchorElem + Left _ -> returnA -< inlines + where + toAnchorElem :: Anchor -> Inlines + toAnchorElem anchorID = spanWith (anchorID, [], []) mempty + -- no classes, no key-value pairs + +-- +read_bookmark :: InlineMatcher +read_bookmark = matchingElement NsText "bookmark" + $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_) + +-- +read_bookmark_start :: InlineMatcher +read_bookmark_start = matchingElement NsText "bookmark-start" + $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_) + +-- +read_reference_start :: InlineMatcher +read_reference_start = matchingElement NsText "reference-mark-start" + $ maybeAddAnchorFrom readAnchorAttr + +-- | Beware: may fail +findAnchorRef :: OdtReader _x Anchor +findAnchorRef = ( findAttr NsText "ref-name" + >>?^ (_ANCHOR_PREFIX_,) + ) >>?! getPrettyAnchor + + +-- +maybeInAnchorRef :: OdtReaderSafe Inlines Inlines +maybeInAnchorRef = proc inlines -> do + fRef <- findAnchorRef -< () + case fRef of + Right anchor -> + arr (toAnchorRef anchor) -<< inlines + Left _ -> returnA -< inlines + where + toAnchorRef :: Anchor -> Inlines -> Inlines + toAnchorRef anchor = link ('#':anchor) "" -- no title + +-- +read_bookmark_ref :: InlineMatcher +read_bookmark_ref = matchingElement NsText "bookmark-ref" + $ maybeInAnchorRef + <<< matchChildContent [] read_plain_text + +-- +read_reference_ref :: InlineMatcher +read_reference_ref = matchingElement NsText "reference-ref" + $ maybeInAnchorRef + <<< matchChildContent [] read_plain_text + + +---------------------- +-- Entry point +---------------------- + +--read_plain_content :: OdtReaderSafe _x Inlines +--read_plain_content = strContent >>^ text + +read_text :: OdtReaderSafe _x Pandoc +read_text = matchChildContent' [ read_header + , read_paragraph + , read_list + , read_table + ] + >>^ doc + +post_process :: Pandoc -> Pandoc +post_process (Pandoc m blocks) = + Pandoc m (post_process' blocks) + +post_process' :: [Block] -> [Block] +post_process' ((Table _ a w h r) : (Div ("", ["caption"], _) [Para inlines] ) : xs) = + (Table inlines a w h r) : ( post_process' xs ) +post_process' bs = bs + +read_body :: OdtReader _x (Pandoc, MediaBag) +read_body = executeIn NsOffice "body" + $ executeIn NsOffice "text" + $ liftAsSuccess + $ proc inlines -> do + txt <- read_text -< inlines + state <- getExtraState -< () + returnA -< (post_process txt, getMediaBag state) diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs b/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs new file mode 100644 index 000000000..877443543 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Generic/Fallible.hs @@ -0,0 +1,260 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Generic.Fallible + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Data types and utilities representing failure. Most of it is based on the +"Either" type in its usual configuration (left represents failure). + +In most cases, the failure type is implied or required to be a "Monoid". + +The choice of "Either" instead of a custom type makes it easier to write +compatible instances of "ArrowChoice". +-} + +-- We export everything +module Text.Pandoc.Readers.Odt.Generic.Fallible where + +import Control.Applicative +import Control.Monad + +import qualified Data.Foldable as F +import Data.Monoid ((<>)) + +-- | Default for now. Will probably become a class at some point. +type Failure = () + +type Fallible a = Either Failure a + + +-- | False -> Left (), True -> Right () +boolToEither :: Bool -> Fallible () +boolToEither False = Left () +boolToEither True = Right () + +-- | False -> Left (), True -> Right () +boolToChoice :: Bool -> Fallible () +boolToChoice False = Left () +boolToChoice True = Right () + +-- +maybeToEither :: Maybe a -> Fallible a +maybeToEither (Just a) = Right a +maybeToEither Nothing = Left () + +-- +eitherToMaybe :: Either _l a -> Maybe a +eitherToMaybe (Left _) = Nothing +eitherToMaybe (Right a) = Just a + +-- | > untagEither === either id id +untagEither :: Either a a -> a +untagEither (Left a) = a +untagEither (Right a) = a + +-- | > fromLeft f === either f id +fromLeft :: (a -> b) -> Either a b -> b +fromLeft f (Left a) = f a +fromLeft _ (Right b) = b + +-- | > fromRight f === either id f +fromRight :: (a -> b) -> Either b a -> b +fromRight _ (Left b) = b +fromRight f (Right a) = f a + +-- | > recover a === fromLeft (const a) === either (const a) id +recover :: a -> Either _f a -> a +recover a (Left _) = a +recover _ (Right a) = a + +-- | I would love to use 'fail'. Alas, 'Monad.fail'... +failWith :: failure -> Either failure _x +failWith f = Left f + +-- +failEmpty :: (Monoid failure) => Either failure _x +failEmpty = failWith mempty + +-- +succeedWith :: a -> Either _x a +succeedWith = Right + +-- +collapseEither :: Either failure (Either failure x) + -> Either failure x +collapseEither (Left f ) = Left f +collapseEither (Right (Left f)) = Left f +collapseEither (Right (Right x)) = Right x + +-- | If either of the values represents an error, the result is a +-- (possibly combined) error. If both values represent a success, +-- both are returned. +chooseMin :: (Monoid a) => Either a b -> Either a b' -> Either a (b,b') +chooseMin = chooseMinWith (,) + +-- | If either of the values represents an error, the result is a +-- (possibly combined) error. If both values represent a success, +-- a combination is returned. +chooseMinWith :: (Monoid a) => (b -> b' -> c) + -> Either a b + -> Either a b' + -> Either a c +chooseMinWith (><) (Right a) (Right b) = Right $ a >< b +chooseMinWith _ (Left a) (Left b) = Left $ a <> b +chooseMinWith _ (Left a) _ = Left a +chooseMinWith _ _ (Left b) = Left b + +-- | If either of the values represents a non-error, the result is a +-- (possibly combined) non-error. If both values represent an error, an error +-- is returned. +chooseMax :: (Monoid a, Monoid b) => Either a b -> Either a b -> Either a b +chooseMax = chooseMaxWith (<>) + +-- | If either of the values represents a non-error, the result is a +-- (possibly combined) non-error. If both values represent an error, an error +-- is returned. +chooseMaxWith :: (Monoid a) => (b -> b -> b) + -> Either a b + -> Either a b + -> Either a b +chooseMaxWith (><) (Right a) (Right b) = Right $ a >< b +chooseMaxWith _ (Left a) (Left b) = Left $ a <> b +chooseMaxWith _ (Right a) _ = Right a +chooseMaxWith _ _ (Right b) = Right b + + +-- | Class of containers that can escalate contained 'Either's. +-- The word "Vector" is meant in the sense of a disease transmitter. +class ChoiceVector v where + spreadChoice :: v (Either f a) -> Either f (v a) + +-- Let's do a few examples first + +instance ChoiceVector Maybe where + spreadChoice (Just (Left f)) = Left f + spreadChoice (Just (Right x)) = Right (Just x) + spreadChoice Nothing = Right Nothing + +instance ChoiceVector (Either l) where + spreadChoice (Right (Left f)) = Left f + spreadChoice (Right (Right x)) = Right (Right x) + spreadChoice (Left x ) = Right (Left x) + +instance ChoiceVector ((,) a) where + spreadChoice (_, Left f) = Left f + spreadChoice (x, Right y) = Right (x,y) + -- Wasn't there a newtype somewhere with the elements flipped? + +-- +-- More instances later, first some discussion. +-- +-- I'll have to freshen up on type system details to see how (or if) to do +-- something like +-- +-- > instance (ChoiceVector a, ChoiceVector b) => ChoiceVector (a b) where +-- > : +-- +-- But maybe it would be even better to use something like +-- +-- > class ChoiceVector v v' f | v -> v' f where +-- > spreadChoice :: v -> Either f v' +-- +-- That way, more places in @v@ could spread the cheer, e.g.: +-- +-- As before: +-- -- ( a , Either f b) (a , b) f +-- > instance ChoiceVector ((,) a (Either f b)) ((,) a b) f where +-- > spreadChoice (_, Left f) = Left f +-- > spreadChoice (a, Right b) = Right (a,b) +-- +-- But also: +-- -- ( Either f a , b) (a , b) f +-- > instance ChoiceVector ((,) (Either f a) b) ((,) a b) f where +-- > spreadChoice (Right a,b) = Right (a,b) +-- > spreadChoice (Left f,_) = Left f +-- +-- And maybe even: +-- -- ( Either f a , Either f b) (a , b) f +-- > instance ChoiceVector ((,) (Either f a) (Either f b)) ((,) a b) f where +-- > spreadChoice (Right a , Right b) = Right (a,b) +-- > spreadChoice (Left f , _ ) = Left f +-- > spreadChoice ( _ , Left f) = Left f +-- +-- Of course that would lead to a lot of overlapping instances... +-- But I can't think of a different way. A selector function might help, +-- but not even a "Data.Traversable" is powerful enough for that. +-- But maybe someone has already solved all this with a lens library. +-- +-- Well, it's an interesting academic question. But for practical purposes, +-- I have more than enough right now. + +instance ChoiceVector ((,,) a b) where + spreadChoice (_,_, Left f) = Left f + spreadChoice (a,b, Right x) = Right (a,b,x) + +instance ChoiceVector ((,,,) a b c) where + spreadChoice (_,_,_, Left f) = Left f + spreadChoice (a,b,c, Right x) = Right (a,b,c,x) + +instance ChoiceVector ((,,,,) a b c d) where + spreadChoice (_,_,_,_, Left f) = Left f + spreadChoice (a,b,c,d, Right x) = Right (a,b,c,d,x) + +instance ChoiceVector (Const a) where + spreadChoice (Const c) = Right (Const c) -- need to repackage because of implicit types + +-- | Fails on the first error +instance ChoiceVector [] where + spreadChoice = sequence -- using the monad instance of Either. + -- Could be generalized to "Data.Traversable" - but why play + -- with UndecidableInstances unless this is really needed. + +-- | Wrapper for a list. While the normal list instance of 'ChoiceVector' +-- fails whenever it can, this type will never fail. +newtype SuccessList a = SuccessList { collectNonFailing :: [a] } + deriving ( Eq, Ord, Show ) + +instance ChoiceVector SuccessList where + spreadChoice = Right . SuccessList . (foldr unTagRight []) . collectNonFailing + where unTagRight (Right x) = (x:) + unTagRight _ = id + +-- | Like 'catMaybes', but for 'Either'. +collectRights :: [Either _l r] -> [r] +collectRights = collectNonFailing . untag . spreadChoice . SuccessList + where untag = fromLeft (error "Unexpected Left") + +-- | A version of 'collectRights' generalized to other containers. The +-- container must be both "reducible" and "buildable". Most general containers +-- should fullfill these requirements, but there is no single typeclass +-- (that I know of) for that. +-- Therefore, they are split between 'Foldable' and 'MonadPlus'. +-- (Note that 'Data.Traversable.Traversable' alone would not be enough, either.) +collectRightsF :: (F.Foldable c, MonadPlus c) => c (Either _l r) -> c r +collectRightsF = F.foldr unTagRight mzero + where unTagRight (Right x) = mplus $ return x + unTagRight _ = id diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs new file mode 100644 index 000000000..82ae3e20e --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs @@ -0,0 +1,62 @@ +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Generic.Namespaces + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +A class containing a set of namespace identifiers. Used to convert between +typesafe Haskell namespace identifiers and unsafe "real world" namespaces. +-} + +module Text.Pandoc.Readers.Odt.Generic.Namespaces where + +import qualified Data.Map as M + +-- +type NameSpaceIRI = String + +-- +type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI + +-- +class (Eq nsID, Ord nsID) => NameSpaceID nsID where + + -- | Given a IRI, possibly update the map and return the id of the namespace. + -- May fail if the namespace is unknown and the application does not + -- allow unknown namespaces. + getNamespaceID :: NameSpaceIRI + -> NameSpaceIRIs nsID + -> Maybe (NameSpaceIRIs nsID, nsID) + -- | Given a namespace id, lookup its IRI. May be overriden for performance. + getIRI :: nsID + -> NameSpaceIRIs nsID + -> Maybe NameSpaceIRI + -- | The root element of an XML document has a namespace, too, and the + -- "XML.Light-parser" is eager to remove the corresponding namespace + -- attribute. + -- As a result, at least this root namespace must be provided. + getInitialIRImap :: NameSpaceIRIs nsID + + getIRI = M.lookup + getInitialIRImap = M.empty diff --git a/src/Text/Pandoc/Readers/Odt/Generic/SetMap.hs b/src/Text/Pandoc/Readers/Odt/Generic/SetMap.hs new file mode 100644 index 000000000..afd7d616c --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Generic/SetMap.hs @@ -0,0 +1,48 @@ +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Generic.SetMap + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +A map of values to sets of values. +-} + +module Text.Pandoc.Readers.Odt.Generic.SetMap where + +import qualified Data.Map as M +import qualified Data.Set as S + +type SetMap k v = M.Map k (S.Set v) + +empty :: SetMap k v +empty = M.empty + +fromList :: (Ord k, Ord v) => [(k,v)] -> SetMap k v +fromList = foldr (uncurry insert) empty + +insert :: (Ord k, Ord v) => k -> v -> SetMap k v -> SetMap k v +insert key value setMap = M.insertWith S.union key (S.singleton value) setMap + +union3 :: (Ord k) => SetMap k v -> SetMap k v -> SetMap k v -> SetMap k v +union3 sm1 sm2 sm3 = sm1 `M.union` sm2 `M.union` sm3 diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs new file mode 100644 index 000000000..6c10ed61d --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -0,0 +1,171 @@ +{-# LANGUAGE TypeOperators #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ViewPatterns #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Reader.Odt.Generic.Utils + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +General utility functions for the odt reader. +-} + +module Text.Pandoc.Readers.Odt.Generic.Utils +( uncurry3 +, uncurry4 +, uncurry5 +, uncurry6 +, uncurry7 +, uncurry8 +, swap +, reverseComposition +, bool +, tryToRead +, Lookupable(..) +, readLookupables +, readLookupable +, readPercent +, findBy +, swing +, composition +) where + +import Control.Category ( Category, (>>>), (<<<) ) +import qualified Control.Category as Cat ( id ) +import Control.Monad ( msum ) + +import qualified Data.Foldable as F ( Foldable, foldr ) +import Data.Maybe + + +-- | Aequivalent to +-- > foldr (.) id +-- where '(.)' are 'id' are the ones from "Control.Category" +-- and 'foldr' is the one from "Data.Foldable". +-- The noun-form was chosen to be consistend with 'sum', 'product' etc +-- based on the discussion at +-- <https://groups.google.com/forum/#!topic/haskell-cafe/VkOZM1zaHOI> +-- (that I was not part of) +composition :: (Category cat, F.Foldable f) => f (cat a a) -> cat a a +composition = F.foldr (<<<) Cat.id + +-- | Aequivalent to +-- > foldr (flip (.)) id +-- where '(.)' are 'id' are the ones from "Control.Category" +-- and 'foldr' is the one from "Data.Foldable". +-- A reversed version of 'composition'. +reverseComposition :: (Category cat, F.Foldable f) => f (cat a a) -> cat a a +reverseComposition = F.foldr (>>>) Cat.id + +-- | 'Either' has 'either', 'Maybe' has 'maybe'. 'Bool' should have 'bool'. +-- Note that the first value is selected if the boolean value is 'False'. +-- That makes 'bool' consistent with the other two. Also, 'bool' now takes its +-- arguments in the exact opposite order compared to the normal if construct. +bool :: a -> a -> Bool -> a +bool x _ False = x +bool _ x True = x + +-- | This function often makes it possible to switch values with the functions +-- that are applied to them. +-- +-- Examples: +-- > swing map :: [a -> b] -> a -> [b] +-- > swing any :: [a -> Bool] -> a -> Bool +-- > swing foldr :: b -> a -> [a -> b -> b] -> b +-- > swing scanr :: c -> a -> [a -> c -> c] -> c +-- > swing zipWith :: [a -> b -> c] -> a -> [b] -> [c] +-- > swing find :: [a -> Bool] -> a -> Maybe (a -> Bool) +-- +-- Stolen from <https://wiki.haskell.org/Pointfree> +swing :: (((a -> b) -> b) -> c -> d) -> c -> a -> d +swing = flip.(.flip id) +-- swing f c a = f ($ a) c + + +-- | Alternative to 'read'/'reads'. The former of these throws errors +-- (nobody wants that) while the latter returns "to much" for simple purposes. +-- This function instead applies 'reads' and returns the first match (if any) +-- in a 'Maybe'. +tryToRead :: (Read r) => String -> Maybe r +tryToRead = reads >>> listToMaybe >>> fmap fst + +-- | A version of 'reads' that requires a '%' sign after the number +readPercent :: ReadS Int +readPercent s = [ (i,s') | (i , r ) <- reads s + , ("%" , s') <- lex r + ] + +-- | Data that can be looked up. +-- This is mostly a utility to read data with kind *. +class Lookupable a where + lookupTable :: [(String, a)] + +-- | The idea is to use this function as if there was a declaration like +-- +-- > instance (Lookupable a) => (Read a) where +-- > readsPrec _ = readLookupables +-- . +-- But including this code in this form would need UndecideableInstances. +-- That is a bad idea. Luckily 'readLookupable' (without the s at the end) +-- can be used directly in almost any case. +readLookupables :: (Lookupable a) => String -> [(a,String)] +readLookupables s = [ (a,rest) | (word,rest) <- lex s, + let result = lookup word lookupTable, + isJust result, + let Just a = result + ] + +-- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer. +readLookupable :: (Lookupable a) => String -> Maybe a +readLookupable s = msum + $ map ((`lookup` lookupTable).fst) + $ lex s + +uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z +uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z +uncurry5 :: (a->b->c->d->e -> z) -> (a,b,c,d,e ) -> z +uncurry6 :: (a->b->c->d->e->f -> z) -> (a,b,c,d,e,f ) -> z +uncurry7 :: (a->b->c->d->e->f->g -> z) -> (a,b,c,d,e,f,g ) -> z +uncurry8 :: (a->b->c->d->e->f->g->h -> z) -> (a,b,c,d,e,f,g,h) -> z + +uncurry3 fun (a,b,c ) = fun a b c +uncurry4 fun (a,b,c,d ) = fun a b c d +uncurry5 fun (a,b,c,d,e ) = fun a b c d e +uncurry6 fun (a,b,c,d,e,f ) = fun a b c d e f +uncurry7 fun (a,b,c,d,e,f,g ) = fun a b c d e f g +uncurry8 fun (a,b,c,d,e,f,g,h) = fun a b c d e f g h + +swap :: (a,b) -> (b,a) +swap (a,b) = (b,a) + +-- | A version of "Data.List.find" that uses a converter to a Maybe instance. +-- The returned value is the first which the converter returns in a 'Just' +-- wrapper. +findBy :: (a -> Maybe b) -> [a] -> Maybe b +findBy _ [] = Nothing +findBy f ((f -> Just x):_ ) = Just x +findBy f ( _:xs) = findBy f xs + diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs new file mode 100644 index 000000000..8c03d1a09 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -0,0 +1,1063 @@ +{-# LANGUAGE Arrows #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE RecordWildCards #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.Generic.XMLConverter + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +A generalized XML parser based on stateful arrows. +It might be sufficient to define this reader as a comonad, but there is +not a lot of use in trying. +-} + +module Text.Pandoc.Readers.Odt.Generic.XMLConverter +( ElementName +, XMLConverterState +, XMLConverter +, FallibleXMLConverter +, swapPosition +, runConverter +, runConverter'' +, runConverter' +, runConverterF' +, runConverterF +, getCurrentElement +, getExtraState +, setExtraState +, modifyExtraState +, convertingExtraState +, producingExtraState +, lookupNSiri +, lookupNSprefix +, readNSattributes +, elemName +, elemNameIs +, strContent +, elContent +, currentElem +, currentElemIs +, expectElement +, elChildren +, findChildren +, filterChildren +, filterChildrenName +, findChild' +, findChild +, filterChild' +, filterChild +, filterChildName' +, filterChildName +, isSet +, isSet' +, isSetWithDefault +, hasAttrValueOf' +, failIfNotAttrValueOf +, isThatTheAttrValue +, searchAttrIn +, searchAttrWith +, searchAttr +, lookupAttr +, lookupAttr' +, lookupAttrWithDefault +, lookupDefaultingAttr +, findAttr' +, findAttr +, findAttrWithDefault +, readAttr +, readAttr' +, readAttrWithDefault +, getAttr +-- , (>/<) +-- , (?>/<) +, executeIn +, collectEvery +, withEveryL +, withEvery +, tryAll +, tryAll' +, IdXMLConverter +, MaybeEConverter +, ElementMatchConverter +, MaybeCConverter +, ContentMatchConverter +, makeMatcherE +, makeMatcherC +, prepareMatchersE +, prepareMatchersC +, matchChildren +, matchContent'' +, matchContent' +, matchContent +) where + +import Control.Applicative hiding ( liftA, liftA2 ) +import Control.Monad ( MonadPlus ) +import Control.Arrow + +import qualified Data.Map as M +import qualified Data.Foldable as F +import Data.Default +import Data.Maybe + +import qualified Text.XML.Light as XML + +import Text.Pandoc.Readers.Odt.Arrows.State +import Text.Pandoc.Readers.Odt.Arrows.Utils + +import Text.Pandoc.Readers.Odt.Generic.Namespaces +import Text.Pandoc.Readers.Odt.Generic.Utils +import Text.Pandoc.Readers.Odt.Generic.Fallible + +-------------------------------------------------------------------------------- +-- Basis types for readability +-------------------------------------------------------------------------------- + +-- +type ElementName = String +type AttributeName = String +type AttributeValue = String + +-- +type NameSpacePrefix = String + +-- +type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix + +-------------------------------------------------------------------------------- +-- Main converter state +-------------------------------------------------------------------------------- + +-- GADT so some of the NameSpaceID restrictions can be deduced +data XMLConverterState nsID extraState where + XMLConverterState :: NameSpaceID nsID + => { -- | A stack of parent elements. The top element is the current one. + -- Arguably, a real Zipper would be better. But that is an + -- optimization that can be made at a later time, e.g. when + -- replacing Text.XML.Light. + parentElements :: [XML.Element] + -- | A map from internal namespace IDs to the namespace prefixes + -- used in XML elements + , namespacePrefixes :: NameSpacePrefixes nsID + -- | A map from internal namespace IDs to namespace IRIs + -- (Only necessary for matching namespace IDs and prefixes) + , namespaceIRIs :: NameSpaceIRIs nsID + -- | A place to put "something else". This feature is used heavily + -- to keep the main code cleaner. More specifically, the main reader + -- is divided into different stages. Each stage lifts something up + -- here, which the next stage can then use. This could of course be + -- generalized to a state-tree or used for the namespace IRIs. The + -- border between states and values is an imaginary one, after all. + -- But the separation as it is seems to be enough for now. + , moreState :: extraState + } + -> XMLConverterState nsID extraState + +-- +createStartState :: (NameSpaceID nsID) + => XML.Element + -> extraState + -> XMLConverterState nsID extraState +createStartState element extraState = + XMLConverterState + { parentElements = [element] + , namespacePrefixes = M.empty + , namespaceIRIs = getInitialIRImap + , moreState = extraState + } + +-- | Functor over extra state +instance Functor (XMLConverterState nsID) where + fmap f ( XMLConverterState parents prefixes iRIs extraState ) + = XMLConverterState parents prefixes iRIs (f extraState) + +-- +replaceExtraState :: extraState + -> XMLConverterState nsID _x + -> XMLConverterState nsID extraState +replaceExtraState x s + = fmap (const x) s + +-- +currentElement :: XMLConverterState nsID extraState + -> XML.Element +currentElement state = head (parentElements state) + +-- | Replace the current position by another, modifying the extra state +-- in the process +swapPosition :: (extraState -> extraState') + -> [XML.Element] + -> XMLConverterState nsID extraState + -> XMLConverterState nsID extraState' +swapPosition f stack state + = state { parentElements = stack + , moreState = f (moreState state) + } + +-- | Replace the current position by another, modifying the extra state +-- in the process +swapStack' :: XMLConverterState nsID extraState + -> [XML.Element] + -> ( XMLConverterState nsID extraState , [XML.Element] ) +swapStack' state stack + = ( state { parentElements = stack } + , parentElements state + ) + +-- +pushElement :: XML.Element + -> XMLConverterState nsID extraState + -> XMLConverterState nsID extraState +pushElement e state = state { parentElements = e:(parentElements state) } + +-- | Pop the top element from the call stack, unless it is the last one. +popElement :: XMLConverterState nsID extraState + -> Maybe (XMLConverterState nsID extraState) +popElement state + | _:es@(_:_) <- parentElements state = Just $ state { parentElements = es } + | otherwise = Nothing + +-------------------------------------------------------------------------------- +-- Main type +-------------------------------------------------------------------------------- + +-- It might be a good idea to pack the converters in a GADT +-- Downside: data instead of type +-- Upside: 'Failure' could be made a parameter as well. + +-- +type XMLConverter nsID extraState input output + = ArrowState (XMLConverterState nsID extraState ) input output + +type FallibleXMLConverter nsID extraState input output + = XMLConverter nsID extraState input (Fallible output) + +-- +runConverter :: XMLConverter nsID extraState input output + -> XMLConverterState nsID extraState + -> input + -> output +runConverter converter state input = snd $ runArrowState converter (state,input) + +-- +runConverter'' :: (NameSpaceID nsID) + => XMLConverter nsID extraState (Fallible ()) output + -> extraState + -> XML.Element + -> output +runConverter'' converter extraState element = runConverter (readNSattributes >>> converter) (createStartState element extraState) () + +runConverter' :: (NameSpaceID nsID) + => FallibleXMLConverter nsID extraState () success + -> extraState + -> XML.Element + -> Fallible success +runConverter' converter extraState element = runConverter (readNSattributes >>? converter) (createStartState element extraState) () + +-- +runConverterF' :: FallibleXMLConverter nsID extraState x y + -> XMLConverterState nsID extraState + -> Fallible x -> Fallible y +runConverterF' a s e = runConverter (returnV e >>? a) s e + +-- +runConverterF :: (NameSpaceID nsID) + => FallibleXMLConverter nsID extraState XML.Element x + -> extraState + -> Fallible XML.Element -> Fallible x +runConverterF a s = either failWith + (\e -> runConverter a (createStartState e s) e) + +-- +getCurrentElement :: XMLConverter nsID extraState x XML.Element +getCurrentElement = extractFromState currentElement + +-- +getExtraState :: XMLConverter nsID extraState x extraState +getExtraState = extractFromState moreState + +-- +setExtraState :: XMLConverter nsID extraState extraState extraState +setExtraState = withState $ \state extra + -> (replaceExtraState extra state , extra) + + +-- | Lifts a function to the extra state. +modifyExtraState :: (extraState -> extraState) + -> XMLConverter nsID extraState x x +modifyExtraState = modifyState.fmap + + +-- | First sets the extra state to the new value. Then modifies the original +-- extra state with a converter that uses the new state. Finally, the +-- intermediate state is dropped and the extra state is lifted into the +-- state as it was at the beginning of the function. +-- As a result, exactly the extra state and nothing else is changed. +-- The resulting converter even behaves like an identity converter on the +-- value level. +-- +-- (The -ing form is meant to be mnemonic in a sequence of arrows as in +-- convertingExtraState () converter >>> doOtherStuff) +-- +convertingExtraState :: extraState' + -> FallibleXMLConverter nsID extraState' extraState extraState + -> FallibleXMLConverter nsID extraState x x +convertingExtraState v a = withSubStateF setVAsExtraState modifyWithA + where + setVAsExtraState = liftAsSuccess $ extractFromState id >>^ replaceExtraState v + modifyWithA = keepingTheValue (moreState ^>> a) + >>^ spreadChoice >>?% flip replaceExtraState + +-- | First sets the extra state to the new value. Then produces a new +-- extra state with a converter that uses the new state. Finally, the +-- intermediate state is dropped and the extra state is lifted into the +-- state as it was at the beginning of the function. +-- As a result, exactly the extra state and nothing else is changed. +-- The resulting converter even behaves like an identity converter on the +-- value level. +-- +-- Aequivalent to +-- +-- > \v x a -> convertingExtraState v (returnV x >>> a) +-- +-- (The -ing form is meant to be mnemonic in a sequence of arrows as in +-- producingExtraState () () producer >>> doOtherStuff) +-- +producingExtraState :: extraState' + -> a + -> FallibleXMLConverter nsID extraState' a extraState + -> FallibleXMLConverter nsID extraState x x +producingExtraState v x a = convertingExtraState v (returnV x >>> a) + + +-------------------------------------------------------------------------------- +-- Work in namespaces +-------------------------------------------------------------------------------- + +-- | Arrow version of 'getIRI' +lookupNSiri :: (NameSpaceID nsID) + => nsID + -> XMLConverter nsID extraState x (Maybe NameSpaceIRI) +lookupNSiri nsID = extractFromState + $ \state -> getIRI nsID $ namespaceIRIs state + +-- +lookupNSprefix :: (NameSpaceID nsID) + => nsID + -> XMLConverter nsID extraState x (Maybe NameSpacePrefix) +lookupNSprefix nsID = extractFromState + $ \state -> M.lookup nsID $ namespacePrefixes state + +-- | Extracts namespace attributes from the current element and tries to +-- update the current mapping accordingly +readNSattributes :: (NameSpaceID nsID) + => FallibleXMLConverter nsID extraState x () +readNSattributes = fromState $ \state -> maybe (state, failEmpty ) + ( , succeedWith ()) + (extractNSAttrs state ) + where + extractNSAttrs :: (NameSpaceID nsID) + => XMLConverterState nsID extraState + -> Maybe (XMLConverterState nsID extraState) + extractNSAttrs startState + = foldl (\state d -> state >>= addNS d) + (Just startState) + nsAttribs + where nsAttribs = mapMaybe readNSattr (XML.elAttribs element) + element = currentElement startState + readNSattr (XML.Attr (XML.QName name _ (Just "xmlns")) iri) + = Just (name, iri) + readNSattr _ = Nothing + addNS (prefix, iri) state = fmap updateState + $ getNamespaceID iri + $ namespaceIRIs state + where updateState (iris,nsID) + = state { namespaceIRIs = iris + , namespacePrefixes = M.insert nsID prefix + $ namespacePrefixes state + } + +-------------------------------------------------------------------------------- +-- Common namespace accessors +-------------------------------------------------------------------------------- + +-- | Given a namespace id and an element name, creates a 'XML.QName' for +-- internal use +elemName :: (NameSpaceID nsID) + => nsID -> ElementName + -> XMLConverter nsID extraState x XML.QName +elemName nsID name = lookupNSiri nsID + &&& lookupNSprefix nsID + >>% XML.QName name + +-- | Checks if a given element matches both a specified namespace id +-- and a specified element name +elemNameIs :: (NameSpaceID nsID) + => nsID -> ElementName + -> XMLConverter nsID extraState XML.Element Bool +elemNameIs nsID name = keepingTheValue (lookupNSiri nsID) >>% hasThatName + where hasThatName e iri = let elName = XML.elName e + in XML.qName elName == name + && XML.qURI elName == iri + +-------------------------------------------------------------------------------- +-- General content +-------------------------------------------------------------------------------- + +-- +strContent :: XMLConverter nsID extraState x String +strContent = getCurrentElement + >>^ XML.strContent + +-- +elContent :: XMLConverter nsID extraState x [XML.Content] +elContent = getCurrentElement + >>^ XML.elContent + +-------------------------------------------------------------------------------- +-- Current element +-------------------------------------------------------------------------------- + +-- +currentElem :: XMLConverter nsID extraState x (XML.QName) +currentElem = getCurrentElement + >>^ XML.elName + +currentElemIs :: (NameSpaceID nsID) + => nsID -> ElementName + -> XMLConverter nsID extraState x Bool +currentElemIs nsID name = getCurrentElement + >>> elemNameIs nsID name + + + +{- +currentElemIs'' nsID name = ( (getCurrentElement >>^ XML.elName >>> + (XML.qName >>^ (&&).(== name) ) + ^&&&^ + (XML.qIRI >>^ (==) ) + ) >>% (.) + ) &&& lookupNSiri nsID >>% ($) +-} + +-- +expectElement :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState x () +expectElement nsID name = currentElemIs nsID name + >>^ boolToChoice + +-------------------------------------------------------------------------------- +-- Chilren +-------------------------------------------------------------------------------- + +-- +elChildren :: XMLConverter nsID extraState x [XML.Element] +elChildren = getCurrentElement + >>^ XML.elChildren + +-- +findChildren :: (NameSpaceID nsID) + => nsID -> ElementName + -> XMLConverter nsID extraState x [XML.Element] +findChildren nsID name = elemName nsID name + &&& getCurrentElement + >>% XML.findChildren + +-- +filterChildren :: (XML.Element -> Bool) + -> XMLConverter nsID extraState x [XML.Element] +filterChildren p = getCurrentElement + >>^ XML.filterChildren p + +-- +filterChildrenName :: (XML.QName -> Bool) + -> XMLConverter nsID extraState x [XML.Element] +filterChildrenName p = getCurrentElement + >>^ XML.filterChildrenName p + +-- +findChild' :: (NameSpaceID nsID) + => nsID + -> ElementName + -> XMLConverter nsID extraState x (Maybe XML.Element) +findChild' nsID name = elemName nsID name + &&& getCurrentElement + >>% XML.findChild + +-- +findChild :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState x XML.Element +findChild nsID name = findChild' nsID name + >>> maybeToChoice + +-- +filterChild' :: (XML.Element -> Bool) + -> XMLConverter nsID extraState x (Maybe XML.Element) +filterChild' p = getCurrentElement + >>^ XML.filterChild p + +-- +filterChild :: (XML.Element -> Bool) + -> FallibleXMLConverter nsID extraState x XML.Element +filterChild p = filterChild' p + >>> maybeToChoice + +-- +filterChildName' :: (XML.QName -> Bool) + -> XMLConverter nsID extraState x (Maybe XML.Element) +filterChildName' p = getCurrentElement + >>^ XML.filterChildName p + +-- +filterChildName :: (XML.QName -> Bool) + -> FallibleXMLConverter nsID extraState x XML.Element +filterChildName p = filterChildName' p + >>> maybeToChoice + + +-------------------------------------------------------------------------------- +-- Attributes +-------------------------------------------------------------------------------- + +-- +isSet :: (NameSpaceID nsID) + => nsID -> AttributeName + -> (Either Failure Bool) + -> FallibleXMLConverter nsID extraState x Bool +isSet nsID attrName deflt + = findAttr' nsID attrName + >>^ maybe deflt stringToBool + +-- +isSet' :: (NameSpaceID nsID) + => nsID -> AttributeName + -> XMLConverter nsID extraState x (Maybe Bool) +isSet' nsID attrName = findAttr' nsID attrName + >>^ (>>= stringToBool') + +isSetWithDefault :: (NameSpaceID nsID) + => nsID -> AttributeName + -> Bool + -> XMLConverter nsID extraState x Bool +isSetWithDefault nsID attrName def' + = isSet' nsID attrName + >>^ fromMaybe def' + +-- +hasAttrValueOf' :: (NameSpaceID nsID) + => nsID -> AttributeName + -> AttributeValue + -> XMLConverter nsID extraState x Bool +hasAttrValueOf' nsID attrName attrValue + = findAttr nsID attrName + >>> ( const False ^|||^ (==attrValue)) + +-- +failIfNotAttrValueOf :: (NameSpaceID nsID) + => nsID -> AttributeName + -> AttributeValue + -> FallibleXMLConverter nsID extraState x () +failIfNotAttrValueOf nsID attrName attrValue + = hasAttrValueOf' nsID attrName attrValue + >>^ boolToChoice + +-- | Is the value that is currently transported in the arrow the value of +-- the specified attribute? +isThatTheAttrValue :: (NameSpaceID nsID) + => nsID -> AttributeName + -> FallibleXMLConverter nsID extraState AttributeValue Bool +isThatTheAttrValue nsID attrName + = keepingTheValue + (findAttr nsID attrName) + >>% right.(==) + +-- | Lookup value in a dictionary, fail if no attribute found or value +-- not in dictionary +searchAttrIn :: (NameSpaceID nsID) + => nsID -> AttributeName + -> [(AttributeValue,a)] + -> FallibleXMLConverter nsID extraState x a +searchAttrIn nsID attrName dict + = findAttr nsID attrName + >>?^? maybeToChoice.(`lookup` dict ) + + +-- | Lookup value in a dictionary. Fail if no attribute found. If value not in +-- dictionary, return default value +searchAttrWith :: (NameSpaceID nsID) + => nsID -> AttributeName + -> a + -> [(AttributeValue,a)] + -> FallibleXMLConverter nsID extraState x a +searchAttrWith nsID attrName defV dict + = findAttr nsID attrName + >>?^ (fromMaybe defV).(`lookup` dict ) + +-- | Lookup value in a dictionary. If attribute or value not found, +-- return default value +searchAttr :: (NameSpaceID nsID) + => nsID -> AttributeName + -> a + -> [(AttributeValue,a)] + -> XMLConverter nsID extraState x a +searchAttr nsID attrName defV dict + = searchAttrIn nsID attrName dict + >>> const defV ^|||^ id + +-- | Read a 'Lookupable' attribute. Fail if no match. +lookupAttr :: (NameSpaceID nsID, Lookupable a) + => nsID -> AttributeName + -> FallibleXMLConverter nsID extraState x a +lookupAttr nsID attrName = lookupAttr' nsID attrName + >>^ maybeToChoice + + +-- | Read a 'Lookupable' attribute. Return the result as a 'Maybe'. +lookupAttr' :: (NameSpaceID nsID, Lookupable a) + => nsID -> AttributeName + -> XMLConverter nsID extraState x (Maybe a) +lookupAttr' nsID attrName + = findAttr' nsID attrName + >>^ (>>= readLookupable) + +-- | Read a 'Lookupable' attribute with explicit default +lookupAttrWithDefault :: (NameSpaceID nsID, Lookupable a) + => nsID -> AttributeName + -> a + -> XMLConverter nsID extraState x a +lookupAttrWithDefault nsID attrName deflt + = lookupAttr' nsID attrName + >>^ fromMaybe deflt + +-- | Read a 'Lookupable' attribute with implicit default +lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a) + => nsID -> AttributeName + -> XMLConverter nsID extraState x a +lookupDefaultingAttr nsID attrName + = lookupAttrWithDefault nsID attrName def + +-- | Return value as a (Maybe String) +findAttr' :: (NameSpaceID nsID) + => nsID -> AttributeName + -> XMLConverter nsID extraState x (Maybe AttributeValue) +findAttr' nsID attrName = elemName nsID attrName + &&& getCurrentElement + >>% XML.findAttr + +-- | Return value as string or fail +findAttr :: (NameSpaceID nsID) + => nsID -> AttributeName + -> FallibleXMLConverter nsID extraState x AttributeValue +findAttr nsID attrName = findAttr' nsID attrName + >>> maybeToChoice + +-- | Return value as string or return provided default value +findAttrWithDefault :: (NameSpaceID nsID) + => nsID -> AttributeName + -> AttributeValue + -> XMLConverter nsID extraState x AttributeValue +findAttrWithDefault nsID attrName deflt + = findAttr' nsID attrName + >>^ fromMaybe deflt + +-- | Read and return value or fail +readAttr :: (NameSpaceID nsID, Read attrValue) + => nsID -> AttributeName + -> FallibleXMLConverter nsID extraState x attrValue +readAttr nsID attrName = readAttr' nsID attrName + >>> maybeToChoice + +-- | Read and return value or return Nothing +readAttr' :: (NameSpaceID nsID, Read attrValue) + => nsID -> AttributeName + -> XMLConverter nsID extraState x (Maybe attrValue) +readAttr' nsID attrName = findAttr' nsID attrName + >>^ (>>= tryToRead) + +-- | Read and return value or return provided default value +readAttrWithDefault :: (NameSpaceID nsID, Read attrValue) + => nsID -> AttributeName + -> attrValue + -> XMLConverter nsID extraState x attrValue +readAttrWithDefault nsID attrName deflt + = findAttr' nsID attrName + >>^ (>>= tryToRead) + >>^ fromMaybe deflt + +-- | Read and return value or return default value from 'Default' instance +getAttr :: (NameSpaceID nsID, Read attrValue, Default attrValue) + => nsID -> AttributeName + -> XMLConverter nsID extraState x attrValue +getAttr nsID attrName = readAttrWithDefault nsID attrName def + +-------------------------------------------------------------------------------- +-- Movements +-------------------------------------------------------------------------------- + +-- +jumpThere :: XMLConverter nsID extraState XML.Element XML.Element +jumpThere = withState (\state element + -> ( pushElement element state , element ) + ) + +-- +swapStack :: XMLConverter nsID extraState [XML.Element] [XML.Element] +swapStack = withState swapStack' + +-- +jumpBack :: FallibleXMLConverter nsID extraState _x _x +jumpBack = tryModifyState (popElement >>> maybeToChoice) + +-- | Support function for "procedural" converters: jump to an element, execute +-- a converter, jump back. +-- This version is safer than 'executeThere', because it does not rely on the +-- internal stack. As a result, the converter can not move around in arbitrary +-- ways. The downside is of course that some of the environment is not +-- accessible to the converter. +switchingTheStack :: XMLConverter nsID moreState a b + -> XMLConverter nsID moreState (a, XML.Element) b +switchingTheStack a = second ( (:[]) ^>> swapStack ) + >>> first a + >>> second swapStack + >>^ fst + +-- | Support function for "procedural" converters: jumps to an element, executes +-- a converter, jumps back. +-- Make sure that the converter is well-behaved; that is it should +-- return to the exact position it started from in /every possible path/ of +-- execution, even if it "fails". If it does not, you may encounter +-- strange bugs. If you are not sure about the behaviour or want to use +-- shortcuts, you can often use 'switchingTheStack' instead. +executeThere :: FallibleXMLConverter nsID moreState a b + -> FallibleXMLConverter nsID moreState (a, XML.Element) b +executeThere a = second jumpThere + >>> fst + ^>> a + >>> jumpBack -- >>? jumpBack would not ensure the jump. + >>^ collapseEither + +-- | Do something in a sub-element, tnen come back +executeIn :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState f s + -> FallibleXMLConverter nsID extraState f s +executeIn nsID name a = keepingTheValue + (findChild nsID name) + >>> ignoringState liftFailure + >>? switchingTheStack a + where liftFailure (_, (Left f)) = Left f + liftFailure (x, (Right e)) = Right (x, e) + +-------------------------------------------------------------------------------- +-- Iterating over children +-------------------------------------------------------------------------------- + +-- Helper converter to prepare different types of iterations. +-- It lifts the children (of a certain type) of the current element +-- into the value level and pairs each one with the current input value. +prepareIteration :: (NameSpaceID nsID) + => nsID -> ElementName + -> XMLConverter nsID extraState b [(b, XML.Element)] +prepareIteration nsID name = keepingTheValue + (findChildren nsID name) + >>% distributeValue + +-- | Applies a converter to every child element of a specific type. +-- Collects results in a 'Monoid'. +-- Fails completely if any conversion fails. +collectEvery :: (NameSpaceID nsID, Monoid m) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState a m + -> FallibleXMLConverter nsID extraState a m +collectEvery nsID name a = prepareIteration nsID name + >>> foldS' (switchingTheStack a) + +-- +withEveryL :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState a b + -> FallibleXMLConverter nsID extraState a [b] +withEveryL = withEvery + +-- | Applies a converter to every child element of a specific type. +-- Collects results in a 'MonadPlus'. +-- Fails completely if any conversion fails. +withEvery :: (NameSpaceID nsID, MonadPlus m) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState a b + -> FallibleXMLConverter nsID extraState a (m b) +withEvery nsID name a = prepareIteration nsID name + >>> iterateS' (switchingTheStack a) + +-- | Applies a converter to every child element of a specific type. +-- Collects all successful results in a list. +tryAll :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState b a + -> XMLConverter nsID extraState b [a] +tryAll nsID name a = prepareIteration nsID name + >>> iterateS (switchingTheStack a) + >>^ collectRights + +-- | Applies a converter to every child element of a specific type. +-- Collects all successful results. +tryAll' :: (NameSpaceID nsID, F.Foldable c, MonadPlus c) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState b a + -> XMLConverter nsID extraState b (c a) +tryAll' nsID name a = prepareIteration nsID name + >>> iterateS (switchingTheStack a) + >>^ collectRightsF + +-------------------------------------------------------------------------------- +-- Matching children +-------------------------------------------------------------------------------- + +type IdXMLConverter nsID moreState x + = XMLConverter nsID moreState x x + +type MaybeEConverter nsID moreState x + = Maybe (IdXMLConverter nsID moreState (x, XML.Element)) + +-- Chainable converter that helps deciding which converter to actually use. +type ElementMatchConverter nsID extraState x + = IdXMLConverter nsID + extraState + (MaybeEConverter nsID extraState x, XML.Element) + +type MaybeCConverter nsID moreState x + = Maybe (IdXMLConverter nsID moreState (x, XML.Content)) + +-- Chainable converter that helps deciding which converter to actually use. +type ContentMatchConverter nsID extraState x + = IdXMLConverter nsID + extraState + (MaybeCConverter nsID extraState x, XML.Content) + +-- Helper function: The @c@ is actually a converter that is to be selected by +-- matching XML elements to the first two parameters. +-- The fold used to match elements however is very simple, so to use it, +-- this function wraps the converter in another converter that unifies +-- the accumulator. Think of a lot of converters with the resulting type +-- chained together. The accumulator not only transports the element +-- unchanged to the next matcher, it also does the actual selecting by +-- combining the intermediate results with '(<|>)'. +makeMatcherE :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState a a + -> ElementMatchConverter nsID extraState a +makeMatcherE nsID name c = ( second ( + elemNameIs nsID name + >>^ bool Nothing (Just tryC) + ) + >>% (<|>) + ) &&&^ snd + where tryC = (fst ^&&& executeThere c >>% recover) &&&^ snd + +-- Helper function: The @c@ is actually a converter that is to be selected by +-- matching XML content to the first two parameters. +-- The fold used to match elements however is very simple, so to use it, +-- this function wraps the converter in another converter that unifies +-- the accumulator. Think of a lot of converters with the resulting type +-- chained together. The accumulator not only transports the element +-- unchanged to the next matcher, it also does the actual selecting by +-- combining the intermediate results with '(<|>)'. +makeMatcherC :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState a a + -> ContentMatchConverter nsID extraState a +makeMatcherC nsID name c = ( second ( contentToElem + >>> returnV Nothing + ||| ( elemNameIs nsID name + >>^ bool Nothing (Just cWithJump) + ) + ) + >>% (<|>) + ) &&&^ snd + where cWithJump = ( fst + ^&&& ( second contentToElem + >>> spreadChoice + ^>>? executeThere c + ) + >>% recover) + &&&^ snd + contentToElem :: FallibleXMLConverter nsID extraState XML.Content XML.Element + contentToElem = arr $ \e -> case e of + XML.Elem e' -> succeedWith e' + _ -> failEmpty + +-- Creates and chains a bunch of matchers +prepareMatchersE :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState x x)] + -> ElementMatchConverter nsID extraState x +--prepareMatchersE = foldSs . (map $ uncurry3 makeMatcherE) +prepareMatchersE = reverseComposition . (map $ uncurry3 makeMatcherE) + +-- Creates and chains a bunch of matchers +prepareMatchersC :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState x x)] + -> ContentMatchConverter nsID extraState x +--prepareMatchersC = foldSs . (map $ uncurry3 makeMatcherC) +prepareMatchersC = reverseComposition . (map $ uncurry3 makeMatcherC) + +-- | Takes a list of element-data - converter groups and +-- * Finds all children of the current element +-- * Matches each group to each child in order (at most one group per child) +-- * Filters non-matched children +-- * Chains all found converters in child-order +-- * Applies the chain to the input element +matchChildren :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] + -> XMLConverter nsID extraState a a +matchChildren lookups = let matcher = prepareMatchersE lookups + in keepingTheValue ( + elChildren + >>> map (Nothing,) + ^>> iterateSL matcher + >>^ catMaybes.map (\(m,e) -> fmap (swallowElem e) m) + -- >>> foldSs + >>> reverseComposition + ) + >>> swap + ^>> app + where + -- let the converter swallow the element and drop the element + -- in the return value + swallowElem element converter = (,element) ^>> converter >>^ fst + +-- +matchContent'' :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] + -> XMLConverter nsID extraState a a +matchContent'' lookups = let matcher = prepareMatchersC lookups + in keepingTheValue ( + elContent + >>> map (Nothing,) + ^>> iterateSL matcher + >>^ catMaybes.map (\(m,c) -> fmap (swallowContent c) m) + -- >>> foldSs + >>> reverseComposition + ) + >>> swap + ^>> app + where + -- let the converter swallow the content and drop the content + -- in the return value + swallowContent content converter = (,content) ^>> converter >>^ fst + + +-- | Takes a list of element-data - converter groups and +-- * Finds all content of the current element +-- * Matches each group to each piece of content in order +-- (at most one group per piece of content) +-- * Filters non-matched content +-- * Chains all found converters in content-order +-- * Applies the chain to the input element +matchContent' :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] + -> XMLConverter nsID extraState a a +matchContent' lookups = matchContent lookups (arr fst) + +-- | Takes a list of element-data - converter groups and +-- * Finds all content of the current element +-- * Matches each group to each piece of content in order +-- (at most one group per piece of content) +-- * Adds a default converter for all non-matched content +-- * Chains all found converters in content-order +-- * Applies the chain to the input element +matchContent :: (NameSpaceID nsID) + => [(nsID, ElementName, FallibleXMLConverter nsID extraState a a)] + -> XMLConverter nsID extraState (a,XML.Content) a + -> XMLConverter nsID extraState a a +matchContent lookups fallback + = let matcher = prepareMatchersC lookups + in keepingTheValue ( + elContent + >>> map (Nothing,) + ^>> iterateSL matcher + >>^ map swallowOrFallback + -- >>> foldSs + >>> reverseComposition + ) + >>> swap + ^>> app + where + -- let the converter swallow the content and drop the content + -- in the return value + swallowOrFallback (Just converter,content) = (,content) ^>> converter >>^ fst + swallowOrFallback (Nothing ,content) = (,content) ^>> fallback + +-------------------------------------------------------------------------------- +-- Internals +-------------------------------------------------------------------------------- + +stringToBool :: (Monoid failure) => String -> Either failure Bool +stringToBool val -- stringToBool' val >>> maybeToChoice + | val `elem` trueValues = succeedWith True + | val `elem` falseValues = succeedWith False + | otherwise = failEmpty + where trueValues = ["true" ,"on" ,"1"] + falseValues = ["false","off","0"] + +stringToBool' :: String -> Maybe Bool +stringToBool' val | val `elem` trueValues = Just True + | val `elem` falseValues = Just False + | otherwise = Nothing + where trueValues = ["true" ,"on" ,"1"] + falseValues = ["false","off","0"] + + +distributeValue :: a -> [b] -> [(a,b)] +distributeValue = map.(,) + +-------------------------------------------------------------------------------- + +{- +NOTES +It might be a good idea to refactor the namespace stuff. +E.g.: if a namespace constructor took a string as a parameter, things like +> a ?>/< (NsText,"body") +would be nicer. +Together with a rename and some trickery, something like +> |< NsText "body" >< NsText "p" ?> a </> </>| +might even be possible. + +Some day, XML.Light should be replaced by something better. +While doing that, it might be useful to replace String as the type of element +names with something else, too. (Of course with OverloadedStrings). +While doing that, maybe the types can be created in a way that something like +> NsText:"body" +could be used. Overloading (:) does not sounds like the best idea, but if the +element name type was a list, this might be possible. +Of course that would be a bit hackish, so the "right" way would probably be +something like +> InNS NsText "body" +but isn't that a bit boring? ;) +-} diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs new file mode 100644 index 000000000..deb009998 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -0,0 +1,110 @@ +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Reader.Odt.Namespaces + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Namespaces used in odt files. +-} + +module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) + ) where + +import Data.List ( isPrefixOf ) +import Data.Maybe ( fromMaybe, listToMaybe ) +import qualified Data.Map as M ( empty, insert ) + +import Text.Pandoc.Readers.Odt.Generic.Namespaces + + +instance NameSpaceID Namespace where + + getInitialIRImap = nsIDmap + + getNamespaceID "" m = Just(m, NsXML) + getNamespaceID iri m = asPair $ fromMaybe (NsOther iri) (findID iri) + where asPair nsID = Just (M.insert nsID iri m, nsID) + + +findID :: NameSpaceIRI -> Maybe Namespace +findID iri = listToMaybe [nsID | (iri',~nsID) <- nsIDs, iri' `isPrefixOf` iri] + +nsIDmap :: NameSpaceIRIs Namespace +nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs + +data Namespace = -- Open Document core + NsOffice | NsStyle | NsText | NsTable | NsForm + | NsDraw | Ns3D | NsAnim | NsChart | NsConfig + | NsDB | NsMeta | NsNumber | NsScript | NsManifest + | NsPresentation + -- Metadata + | NsODF + -- Compatible elements + | NsXSL_FO | NsSVG | NsSmil + -- External standards + | NsMathML | NsXForms | NsXLink | NsXHtml | NsGRDDL + | NsDublinCore + -- Metadata manifest + | NsPKG + -- Others + | NsOpenFormula + -- Core XML (basically only for the 'id'-attribute) + | NsXML + -- Fallback + | NsOther String + deriving ( Eq, Ord, Show ) + +-- | Not the actual iri's, but large prefixes of them - this way there are +-- less versioning problems and the like. +nsIDs :: [(String,Namespace)] +nsIDs = [ + ("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ), + ("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ), + ("urn:oasis:names:tc:opendocument:xmlns:config" , NsConfig ), + ("urn:oasis:names:tc:opendocument:xmlns:database" , NsDB ), + ("urn:oasis:names:tc:opendocument:xmlns:dr3d" , Ns3D ), + ("urn:oasis:names:tc:opendocument:xmlns:drawing" , NsDraw ), + ("urn:oasis:names:tc:opendocument:xmlns:form" , NsForm ), + ("urn:oasis:names:tc:opendocument:xmlns:manifest" , NsManifest ), + ("urn:oasis:names:tc:opendocument:xmlns:meta" , NsMeta ), + ("urn:oasis:names:tc:opendocument:xmlns:datastyle" , NsNumber ), + ("urn:oasis:names:tc:opendocument:xmlns:of" , NsOpenFormula ), + ("urn:oasis:names:tc:opendocument:xmlns:office:1.0" , NsOffice ), + ("urn:oasis:names:tc:opendocument:xmlns:presentation" , NsPresentation ), + ("urn:oasis:names:tc:opendocument:xmlns:script" , NsScript ), + ("urn:oasis:names:tc:opendocument:xmlns:style" , NsStyle ), + ("urn:oasis:names:tc:opendocument:xmlns:table" , NsTable ), + ("urn:oasis:names:tc:opendocument:xmlns:text" , NsText ), + ("urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible", NsXSL_FO ), + ("urn:oasis:names:tc:opendocument:xmlns:smil-compatible" , NsSmil ), + ("urn:oasis:names:tc:opendocument:xmlns:svg-compatible" , NsSVG ), + ("http://docs.oasis-open.org/ns/office/1.2/meta/odf" , NsODF ), + ("http://docs.oasis-open.org/ns/office/1.2/meta/pkg" , NsPKG ), + ("http://purl.org/dc/elements" , NsDublinCore ), + ("http://www.w3.org/2003/g/data-view" , NsGRDDL ), + ("http://www.w3.org/1998/Math/MathML" , NsMathML ), + ("http://www.w3.org/1999/xhtml" , NsXHtml ), + ("http://www.w3.org/2002/xforms" , NsXForms ), + ("http://www.w3.org/1999/xlink" , NsXLink ) + ] diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs new file mode 100644 index 000000000..26ba6df82 --- /dev/null +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -0,0 +1,744 @@ +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE Arrows #-} + +{- +Copyright (C) 2015 Martin Linnemann <theCodingMarlin@googlemail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Odt.StyleReader + Copyright : Copyright (C) 2015 Martin Linnemann + License : GNU GPL, version 2 or above + + Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> + Stability : alpha + Portability : portable + +Reader for the style information in an odt document. +-} + +module Text.Pandoc.Readers.Odt.StyleReader +( Style (..) +, StyleName +, StyleFamily (..) +, Styles (..) +, StyleProperties (..) +, TextProperties (..) +, ParaProperties (..) +, VerticalTextPosition (..) +, ListItemNumberFormat (..) +, ListLevel +, ListStyle (..) +, ListLevelStyle (..) +, ListLevelType (..) +, LengthOrPercent (..) +, lookupStyle +, getTextProperty +, getTextProperty' +, getParaProperty +, getListStyle +, getListLevelStyle +, getStyleFamily +, lookupDefaultStyle +, lookupDefaultStyle' +, lookupListStyleByName +, getPropertyChain +, textPropertyChain +, stylePropertyChain +, stylePropertyChain' +, getStylePropertyChain +, extendedStylePropertyChain +, extendedStylePropertyChain' +, liftStyles +, readStylesAt +) where + +import Control.Arrow +import Control.Applicative hiding ( liftA, liftA2, liftA3 ) + +import qualified Data.Foldable as F +import qualified Data.Map as M +import qualified Data.Set as S +import Data.Char ( isDigit ) +import Data.Default +import Data.List ( unfoldr ) +import Data.Maybe + +import qualified Text.XML.Light as XML + +import Text.Pandoc.Readers.Odt.Arrows.State +import Text.Pandoc.Readers.Odt.Arrows.Utils + +import Text.Pandoc.Readers.Odt.Generic.Utils +import qualified Text.Pandoc.Readers.Odt.Generic.SetMap as SM +import Text.Pandoc.Readers.Odt.Generic.Fallible +import Text.Pandoc.Readers.Odt.Generic.XMLConverter + +import Text.Pandoc.Readers.Odt.Namespaces +import Text.Pandoc.Readers.Odt.Base + + +readStylesAt :: XML.Element -> Fallible Styles +readStylesAt e = runConverter' readAllStyles mempty e + +-------------------------------------------------------------------------------- +-- Reader for font declarations and font pitches +-------------------------------------------------------------------------------- + +-- Pandoc has no support for different font pitches. Yet knowing them can be +-- very helpful in cases where Pandoc has more semantics than OpenDocument. +-- In these cases, the pitch can help deciding as what to define a block of +-- text. So let's start with a type for font pitches: + +data FontPitch = PitchVariable | PitchFixed + deriving ( Eq, Show ) + +instance Lookupable FontPitch where + lookupTable = [ ("variable" , PitchVariable) + , ("fixed" , PitchFixed ) + ] + +instance Default FontPitch where + def = PitchVariable + +-- The font pitch can be specifed in a style directly. Normally, however, +-- it is defined in the font. That is also the specs' recommendation. +-- +-- Thus, we want + +type FontFaceName = String + +type FontPitches = M.Map FontFaceName FontPitch + +-- To get there, the fonts have to be read and the pitches extracted. +-- But the resulting map are only needed at one later place, so it should not be +-- transported on the value level, especially as we already use a state arrow. +-- So instead, the resulting map is lifted into the state of the reader. +-- (An alternative might be ImplicitParams, but again, we already have a state.) +-- +-- So the main style readers will have the types +type StyleReader a b = XMLReader FontPitches a b +-- and +type StyleReaderSafe a b = XMLReaderSafe FontPitches a b +-- respectively. +-- +-- But before we can work with these, we need to define the reader that reads +-- the fonts: + +-- | A reader for font pitches +fontPitchReader :: XMLReader _s _x FontPitches +fontPitchReader = executeIn NsOffice "font-face-decls" ( + ( withEveryL NsStyle "font-face" $ liftAsSuccess ( + findAttr' NsStyle "name" + &&& + lookupDefaultingAttr NsStyle "font-pitch" + ) + ) + >>?^ ( M.fromList . (foldl accumLegalPitches []) ) + ) + where accumLegalPitches ls (Nothing,_) = ls + accumLegalPitches ls (Just n,p) = (n,p):ls + + +-- | A wrapper around the font pitch reader that lifts the result into the +-- state. +readFontPitches :: StyleReader x x +readFontPitches = producingExtraState () () fontPitchReader + + +-- | Looking up a pitch in the state of the arrow. +-- +-- The function does the following: +-- * Look for the font pitch in an attribute. +-- * If that fails, look for the font name, look up the font in the state +-- and use the pitch from there. +-- * Return the result in a Maybe +-- +findPitch :: XMLReaderSafe FontPitches _x (Maybe FontPitch) +findPitch = ( lookupAttr NsStyle "font-pitch" + `ifFailedDo` findAttr NsStyle "font-name" + >>? ( keepingTheValue getExtraState + >>% M.lookup + >>^ maybeToChoice + ) + ) + >>> choiceToMaybe + +-------------------------------------------------------------------------------- +-- Definitions of main data +-------------------------------------------------------------------------------- + +type StyleName = String + +-- | There are two types of styles: named styles with a style family and an +-- optional style parent, and default styles for each style family, +-- defining default style properties +data Styles = Styles + { stylesByName :: M.Map StyleName Style + , listStylesByName :: M.Map StyleName ListStyle + , defaultStyleMap :: M.Map StyleFamily StyleProperties + } + deriving ( Show ) + +-- Styles from a monoid under union +instance Monoid Styles where + mempty = Styles M.empty M.empty M.empty + mappend (Styles sBn1 dSm1 lsBn1) + (Styles sBn2 dSm2 lsBn2) + = Styles (M.union sBn1 sBn2) + (M.union dSm1 dSm2) + (M.union lsBn1 lsBn2) + +-- Not all families from the specifications are implemented, only those we need. +-- But there are none that are not mentioned here. +data StyleFamily = FaText | FaParagraph +-- | FaTable | FaTableCell | FaTableColumn | FaTableRow +-- | FaGraphic | FaDrawing | FaChart +-- | FaPresentation +-- | FaRuby + deriving ( Eq, Ord, Show ) + +instance Lookupable StyleFamily where + lookupTable = [ ( "text" , FaText ) + , ( "paragraph" , FaParagraph ) +-- , ( "table" , FaTable ) +-- , ( "table-cell" , FaTableCell ) +-- , ( "table-column" , FaTableColumn ) +-- , ( "table-row" , FaTableRow ) +-- , ( "graphic" , FaGraphic ) +-- , ( "drawing-page" , FaDrawing ) +-- , ( "chart" , FaChart ) +-- , ( "presentation" , FaPresentation ) +-- , ( "ruby" , FaRuby ) + ] + +-- | A named style +data Style = Style { styleFamily :: Maybe StyleFamily + , styleParentName :: Maybe StyleName + , listStyle :: Maybe StyleName + , styleProperties :: StyleProperties + } + deriving ( Eq, Show ) + +data StyleProperties = SProps { textProperties :: Maybe TextProperties + , paraProperties :: Maybe ParaProperties +-- , tableColProperties :: Maybe TColProperties +-- , tableRowProperties :: Maybe TRowProperties +-- , tableCellProperties :: Maybe TCellProperties +-- , tableProperties :: Maybe TableProperties +-- , graphicProperties :: Maybe GraphProperties + } + deriving ( Eq, Show ) + +instance Default StyleProperties where + def = SProps { textProperties = Just def + , paraProperties = Just def + } + +data TextProperties = PropT { isEmphasised :: Bool + , isStrong :: Bool + , pitch :: Maybe FontPitch + , verticalPosition :: VerticalTextPosition + , underline :: Maybe UnderlineMode + , strikethrough :: Maybe UnderlineMode + } + deriving ( Eq, Show ) + +instance Default TextProperties where + def = PropT { isEmphasised = False + , isStrong = False + , pitch = Just def + , verticalPosition = def + , underline = Nothing + , strikethrough = Nothing + } + +data ParaProperties = PropP { paraNumbering :: ParaNumbering + , indentation :: LengthOrPercent + , margin_left :: LengthOrPercent + } + deriving ( Eq, Show ) + +instance Default ParaProperties where + def = PropP { paraNumbering = NumberingNone + , indentation = def + , margin_left = def + } + +---- +-- All the little data types that make up the properties +---- + +data VerticalTextPosition = VPosNormal | VPosSuper | VPosSub + deriving ( Eq, Show ) + +instance Default VerticalTextPosition where + def = VPosNormal + +instance Read VerticalTextPosition where + readsPrec _ s = [ (VPosSub , s') | ("sub" , s') <- lexS ] + ++ [ (VPosSuper , s') | ("super" , s') <- lexS ] + ++ [ (signumToVPos n , s') | ( n , s') <- readPercent s ] + where + lexS = lex s + signumToVPos n | n < 0 = VPosSub + | n > 0 = VPosSuper + | otherwise = VPosNormal + +data UnderlineMode = UnderlineModeNormal | UnderlineModeSkipWhitespace + deriving ( Eq, Show ) + +instance Lookupable UnderlineMode where + lookupTable = [ ( "continuous" , UnderlineModeNormal ) + , ( "skip-white-space" , UnderlineModeSkipWhitespace ) + ] + + +data ParaNumbering = NumberingNone | NumberingKeep | NumberingRestart Int + deriving ( Eq, Show ) + +data LengthOrPercent = LengthValueMM Int | PercentValue Int + deriving ( Eq, Show ) + +instance Default LengthOrPercent where + def = LengthValueMM 0 + +instance Read LengthOrPercent where + readsPrec _ s = + [ (PercentValue percent , s' ) | (percent , s' ) <- readPercent s] + ++ [ (LengthValueMM lengthMM , s'') | (length' , s' ) <- reads s + , (unit , s'') <- reads s' + , let lengthMM = estimateInMillimeter + length' unit + ] + +data XslUnit = XslUnitMM | XslUnitCM + | XslUnitInch + | XslUnitPoints | XslUnitPica + | XslUnitPixel + | XslUnitEM + +instance Show XslUnit where + show XslUnitMM = "mm" + show XslUnitCM = "cm" + show XslUnitInch = "in" + show XslUnitPoints = "pt" + show XslUnitPica = "pc" + show XslUnitPixel = "px" + show XslUnitEM = "em" + +instance Read XslUnit where + readsPrec _ "mm" = [(XslUnitMM , "")] + readsPrec _ "cm" = [(XslUnitCM , "")] + readsPrec _ "in" = [(XslUnitInch , "")] + readsPrec _ "pt" = [(XslUnitPoints , "")] + readsPrec _ "pc" = [(XslUnitPica , "")] + readsPrec _ "px" = [(XslUnitPixel , "")] + readsPrec _ "em" = [(XslUnitEM , "")] + readsPrec _ _ = [] + +-- | Rough conversion of measures into millimeters. +-- Pixels and em's are actually implemetation dependant/relative measures, +-- so I could not really easily calculate anything exact here even if I wanted. +-- But I do not care about exactness right now, as I only use measures +-- to determine if a paragraph is "indented" or not. +estimateInMillimeter :: Int -> XslUnit -> Int +estimateInMillimeter n XslUnitMM = n +estimateInMillimeter n XslUnitCM = n * 10 +estimateInMillimeter n XslUnitInch = n * 25 -- \* 25.4 +estimateInMillimeter n XslUnitPoints = n `div` 3 -- \* 1/72 * 25.4 +estimateInMillimeter n XslUnitPica = n * 4 -- \* 12 * 1/72 * 25.4 +estimateInMillimeter n XslUnitPixel = n `div`3 -- \* 1/72 * 25.4 +estimateInMillimeter n XslUnitEM = n * 7 -- \* 16 * 1/72 * 25.4 + + +---- +-- List styles +---- + +type ListLevel = Int + +newtype ListStyle = ListStyle { levelStyles :: M.Map ListLevel ListLevelStyle + } + deriving ( Eq, Show ) + +-- +getListLevelStyle :: ListLevel -> ListStyle -> Maybe ListLevelStyle +getListLevelStyle level ListStyle{..} = + let (lower , exactHit , _) = M.splitLookup level levelStyles + in exactHit <|> fmap fst (M.maxView lower) + -- findBy (`M.lookup` levelStyles) [level, (level-1) .. 1] + -- \^ simpler, but in general less efficient + +data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType + , listItemPrefix :: Maybe String + , listItemSuffix :: Maybe String + , listItemFormat :: ListItemNumberFormat + , listItemStart :: Int + } + deriving ( Eq, Ord ) + +instance Show ListLevelStyle where + show ListLevelStyle{..} = "<LLS|" + ++ (show listLevelType) + ++ "|" + ++ (maybeToString listItemPrefix) + ++ (show listItemFormat) + ++ (maybeToString listItemSuffix) + ++ ">" + where maybeToString = fromMaybe "" + +data ListLevelType = LltBullet | LltImage | LltNumbered + deriving ( Eq, Ord, Show ) + +data ListItemNumberFormat = LinfNone + | LinfNumber + | LinfRomanLC | LinfRomanUC + | LinfAlphaLC | LinfAlphaUC + | LinfString String + deriving ( Eq, Ord ) + +instance Show ListItemNumberFormat where + show LinfNone = "" + show LinfNumber = "1" + show LinfRomanLC = "i" + show LinfRomanUC = "I" + show LinfAlphaLC = "a" + show LinfAlphaUC = "A" + show (LinfString s) = s + +instance Default ListItemNumberFormat where + def = LinfNone + +instance Read ListItemNumberFormat where + readsPrec _ "" = [(LinfNone , "")] + readsPrec _ "1" = [(LinfNumber , "")] + readsPrec _ "i" = [(LinfRomanLC , "")] + readsPrec _ "I" = [(LinfRomanUC , "")] + readsPrec _ "a" = [(LinfAlphaLC , "")] + readsPrec _ "A" = [(LinfAlphaUC , "")] + readsPrec _ s = [(LinfString s , "")] + +-------------------------------------------------------------------------------- +-- Readers +-- +-- ...it seems like a whole lot of this should be automatically deriveable +-- or at least moveable into a class. Most of this is data concealed in +-- code. +-------------------------------------------------------------------------------- + +-- +readAllStyles :: StyleReader _x Styles +readAllStyles = ( readFontPitches + >>?! ( readAutomaticStyles + &&& readStyles )) + >>?%? chooseMax + -- all top elements are always on the same hierarchy level + +-- +readStyles :: StyleReader _x Styles +readStyles = executeIn NsOffice "styles" $ liftAsSuccess + $ liftA3 Styles + ( tryAll NsStyle "style" readStyle >>^ M.fromList ) + ( tryAll NsText "list-style" readListStyle >>^ M.fromList ) + ( tryAll NsStyle "default-style" readDefaultStyle >>^ M.fromList ) + +-- +readAutomaticStyles :: StyleReader _x Styles +readAutomaticStyles = executeIn NsOffice "automatic-styles" $ liftAsSuccess + $ liftA3 Styles + ( tryAll NsStyle "style" readStyle >>^ M.fromList ) + ( tryAll NsText "list-style" readListStyle >>^ M.fromList ) + ( returnV M.empty ) + +-- +readDefaultStyle :: StyleReader _x (StyleFamily, StyleProperties) +readDefaultStyle = lookupAttr NsStyle "family" + >>?! keepingTheValue readStyleProperties + +-- +readStyle :: StyleReader _x (StyleName,Style) +readStyle = findAttr NsStyle "name" + >>?! keepingTheValue + ( liftA4 Style + ( lookupAttr' NsStyle "family" ) + ( findAttr' NsStyle "parent-style-name" ) + ( findAttr' NsStyle "list-style-name" ) + readStyleProperties + ) + +-- +readStyleProperties :: StyleReaderSafe _x StyleProperties +readStyleProperties = liftA2 SProps + ( readTextProperties >>> choiceToMaybe ) + ( readParaProperties >>> choiceToMaybe ) + +-- +readTextProperties :: StyleReader _x TextProperties +readTextProperties = + executeIn NsStyle "text-properties" $ liftAsSuccess + ( liftA6 PropT + ( searchAttr NsXSL_FO "font-style" False isFontEmphasised ) + ( searchAttr NsXSL_FO "font-weight" False isFontBold ) + ( findPitch ) + ( getAttr NsStyle "text-position" ) + ( readUnderlineMode ) + ( readStrikeThroughMode ) + ) + where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] + isFontBold = ("normal",False):("bold",True) + :(map ((,True).show) ([100,200..900]::[Int])) + +readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) +readUnderlineMode = readLineMode "text-underline-mode" + "text-underline-style" + +readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode) +readStrikeThroughMode = readLineMode "text-line-through-mode" + "text-line-through-style" + +readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode) +readLineMode modeAttr styleAttr = proc x -> do + isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x + mode <- lookupAttr' NsStyle modeAttr -< x + if isUL + then case mode of + Just m -> returnA -< Just m + Nothing -> returnA -< Just UnderlineModeNormal + else returnA -< Nothing + where + isLinePresent = [("none",False)] ++ map (,True) + [ "dash" , "dot-dash" , "dot-dot-dash" , "dotted" + , "long-dash" , "solid" , "wave" + ] + +-- +readParaProperties :: StyleReader _x ParaProperties +readParaProperties = + executeIn NsStyle "paragraph-properties" $ liftAsSuccess + ( liftA3 PropP + ( liftA2 readNumbering + ( isSet' NsText "number-lines" ) + ( readAttr' NsText "line-number" ) + ) + ( liftA2 readIndentation + ( isSetWithDefault NsStyle "auto-text-indent" False ) + ( getAttr NsXSL_FO "text-indent" ) + ) + ( getAttr NsXSL_FO "margin-left" ) + ) + where readNumbering (Just True) (Just n) = NumberingRestart n + readNumbering (Just True) _ = NumberingKeep + readNumbering _ _ = NumberingNone + + readIndentation False indent = indent + readIndentation True _ = def + +---- +-- List styles +---- + +-- +readListStyle :: StyleReader _x (StyleName, ListStyle) +readListStyle = + findAttr NsStyle "name" + >>?! keepingTheValue + ( liftA ListStyle + $ ( liftA3 SM.union3 + ( readListLevelStyles NsText "list-level-style-number" LltNumbered ) + ( readListLevelStyles NsText "list-level-style-bullet" LltBullet ) + ( readListLevelStyles NsText "list-level-style-image" LltImage ) + ) >>^ M.mapMaybe chooseMostSpecificListLevelStyle + ) +-- +readListLevelStyles :: Namespace -> ElementName + -> ListLevelType + -> StyleReaderSafe _x (SM.SetMap Int ListLevelStyle) +readListLevelStyles namespace elementName levelType = + ( tryAll namespace elementName (readListLevelStyle levelType) + >>^ SM.fromList + ) + +-- +readListLevelStyle :: ListLevelType -> StyleReader _x (Int, ListLevelStyle) +readListLevelStyle levelType = readAttr NsText "level" + >>?! keepingTheValue + ( liftA5 toListLevelStyle + ( returnV levelType ) + ( findAttr' NsStyle "num-prefix" ) + ( findAttr' NsStyle "num-suffix" ) + ( getAttr NsStyle "num-format" ) + ( findAttr' NsText "start-value" ) + ) + where + toListLevelStyle _ p s LinfNone b = ListLevelStyle LltBullet p s LinfNone (startValue b) + toListLevelStyle _ p s f@(LinfString _) b = ListLevelStyle LltBullet p s f (startValue b) + toListLevelStyle t p s f b = ListLevelStyle t p s f (startValue b) + startValue (Just "") = 1 + startValue (Just v) = if all isDigit v + then read v + else 1 + startValue Nothing = 1 + +-- +chooseMostSpecificListLevelStyle :: S.Set ListLevelStyle -> Maybe ListLevelStyle +chooseMostSpecificListLevelStyle ls | ls == mempty = Nothing + | otherwise = Just ( F.foldr1 select ls ) + where + select ( ListLevelStyle t1 p1 s1 f1 b1 ) + ( ListLevelStyle t2 p2 s2 f2 _ ) + = ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) (selectLinf f1 f2) b1 + select' LltNumbered _ = LltNumbered + select' _ LltNumbered = LltNumbered + select' _ _ = LltBullet + selectLinf LinfNone f2 = f2 + selectLinf f1 LinfNone = f1 + selectLinf (LinfString _) f2 = f2 + selectLinf f1 (LinfString _) = f1 + selectLinf f1 _ = f1 + + +-------------------------------------------------------------------------------- +-- Tools to access style data +-------------------------------------------------------------------------------- + +-- +lookupStyle :: StyleName -> Styles -> Maybe Style +lookupStyle name Styles{..} = M.lookup name stylesByName + +-- +lookupDefaultStyle :: StyleFamily -> Styles -> StyleProperties +lookupDefaultStyle family Styles{..} = fromMaybe def + (M.lookup family defaultStyleMap) + +-- +lookupDefaultStyle' :: Styles -> StyleFamily -> StyleProperties +lookupDefaultStyle' Styles{..} family = fromMaybe def + (M.lookup family defaultStyleMap) + +-- +getListStyle :: Style -> Styles -> Maybe ListStyle +getListStyle Style{..} styles = listStyle >>= (`lookupListStyleByName` styles) + +-- +lookupListStyleByName :: StyleName -> Styles -> Maybe ListStyle +lookupListStyleByName name Styles{..} = M.lookup name listStylesByName + + +-- | Returns a chain of parent of the current style. The direct parent will +-- be the first element of the list, followed by its parent and so on. +-- The current style is not in the list. +parents :: Style -> Styles -> [Style] +parents style styles = unfoldr findNextParent style -- Ha! + where findNextParent Style{..} + = fmap duplicate $ (`lookupStyle` styles) =<< styleParentName + +-- | Looks up the style family of the current style. Normally, every style +-- should have one. But if not, all parents are searched. +getStyleFamily :: Style -> Styles -> Maybe StyleFamily +getStyleFamily style@Style{..} styles + = styleFamily + <|> (F.asum $ map (`getStyleFamily` styles) $ parents style styles) + +-- | Each 'Style' has certain 'StyleProperties'. But sometimes not all property +-- values are specified. Instead, a value might be inherited from a +-- parent style. This function makes this chain of inheritance +-- concrete and easily accessible by encapsulating the necessary lookups. +-- The resulting list contains the direct properties of the style as the first +-- element, the ones of the direct parent element as the next one, and so on. +-- +-- Note: There should also be default properties for each style family. These +-- are @not@ contained in this list because properties inherited from +-- parent elements take precedence over default styles. +-- +-- This function is primarily meant to be used through convenience wrappers. +-- +stylePropertyChain :: Style -> Styles -> [StyleProperties] +stylePropertyChain style styles + = map styleProperties (style : parents style styles) + +-- +extendedStylePropertyChain :: [Style] -> Styles -> [StyleProperties] +extendedStylePropertyChain [] _ = [] +extendedStylePropertyChain [style] styles = (stylePropertyChain style styles) + ++ (maybeToList (fmap (lookupDefaultStyle' styles) (getStyleFamily style styles))) +extendedStylePropertyChain (style:trace) styles = (stylePropertyChain style styles) + ++ (extendedStylePropertyChain trace styles) +-- Optimizable with Data.Sequence + +-- +extendedStylePropertyChain' :: [Style] -> Styles -> Maybe [StyleProperties] +extendedStylePropertyChain' [] _ = Nothing +extendedStylePropertyChain' [style] styles = Just ( + (stylePropertyChain style styles) + ++ (maybeToList (fmap (lookupDefaultStyle' styles) (getStyleFamily style styles))) + ) +extendedStylePropertyChain' (style:trace) styles = fmap ((stylePropertyChain style styles) ++) + (extendedStylePropertyChain' trace styles) + +-- +stylePropertyChain' :: Styles -> Style -> [StyleProperties] +stylePropertyChain' = flip stylePropertyChain + +-- +getStylePropertyChain :: StyleName -> Styles -> [StyleProperties] +getStylePropertyChain name styles = maybe [] + (`stylePropertyChain` styles) + (lookupStyle name styles) + +-- +getPropertyChain :: (StyleProperties -> Maybe a) -> Style -> Styles -> [a] +getPropertyChain extract style styles = catMaybes + $ map extract + $ stylePropertyChain style styles + +-- +textPropertyChain :: Style -> Styles -> [TextProperties] +textPropertyChain = getPropertyChain textProperties + +-- +paraPropertyChain :: Style -> Styles -> [ParaProperties] +paraPropertyChain = getPropertyChain paraProperties + +-- +getTextProperty :: (TextProperties -> a) -> Style -> Styles -> Maybe a +getTextProperty extract style styles = fmap extract + $ listToMaybe + $ textPropertyChain style styles + +-- +getTextProperty' :: (TextProperties -> Maybe a) -> Style -> Styles -> Maybe a +getTextProperty' extract style styles = F.asum + $ map extract + $ textPropertyChain style styles + +-- +getParaProperty :: (ParaProperties -> a) -> Style -> Styles -> Maybe a +getParaProperty extract style styles = fmap extract + $ listToMaybe + $ paraPropertyChain style styles + +-- | Lifts the reader into another readers' state. +liftStyles :: (OdtConverterState s -> OdtConverterState Styles) + -> (OdtConverterState Styles -> OdtConverterState s ) + -> XMLReader s x x +liftStyles extract inject = switchState extract inject + $ convertingExtraState M.empty readAllStyles + diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs new file mode 100644 index 000000000..c8dbbf45a --- /dev/null +++ b/src/Text/Pandoc/Readers/Org.hs @@ -0,0 +1,62 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Conversion of org-mode formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Org ( readOrg ) where + +import Text.Pandoc.Readers.Org.Blocks ( blockList, meta ) +import Text.Pandoc.Readers.Org.Parsing ( OrgParser, readWithM ) +import Text.Pandoc.Readers.Org.ParserState ( optionsToParserState ) + +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Error +import Text.Pandoc.Options + +import Control.Monad.Except ( throwError ) +import Control.Monad.Reader ( runReaderT ) + + +-- | Parse org-mode string and return a Pandoc document. +readOrg :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readOrg opts s = do + parsed <- flip runReaderT def $ + readWithM parseOrg (optionsToParserState opts) (s ++ "\n\n") + case parsed of + Right result -> return result + Left _ -> throwError $ PandocParseError "problem parsing org" + +-- +-- Parser +-- +parseOrg :: PandocMonad m => OrgParser m Pandoc +parseOrg = do + blocks' <- blockList + meta' <- meta + return $ Pandoc meta' blocks' diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs new file mode 100644 index 000000000..5588c4552 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -0,0 +1,137 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.BlockStarts + ( exampleLineStart + , hline + , noteMarker + , tableStart + , drawerStart + , headerStart + , metaLineStart + , latexEnvStart + , commentLineStart + , bulletListStart + , orderedListStart + , endOfBlock + ) where + +import Control.Monad ( void ) +import Text.Pandoc.Readers.Org.Parsing + +-- | Horizontal Line (five -- dashes or more) +hline :: Monad m => OrgParser m () +hline = try $ do + skipSpaces + string "-----" + many (char '-') + skipSpaces + newline + return () + +-- | Read the start of a header line, return the header level +headerStart :: Monad m => OrgParser m Int +headerStart = try $ + (length <$> many1 (char '*')) <* many1 (char ' ') <* updateLastPreCharPos + +tableStart :: Monad m => OrgParser m Char +tableStart = try $ skipSpaces *> char '|' + +latexEnvStart :: Monad m => OrgParser m String +latexEnvStart = try $ do + skipSpaces *> string "\\begin{" + *> latexEnvName + <* string "}" + <* blankline + where + latexEnvName :: Monad m => OrgParser m String + latexEnvName = try $ mappend <$> many1 alphaNum <*> option "" (string "*") + + +-- | Parses bullet list marker. +bulletListStart :: Monad m => OrgParser m () +bulletListStart = try $ + choice + [ () <$ skipSpaces <* oneOf "+-" <* skipSpaces1 + , () <$ skipSpaces1 <* char '*' <* skipSpaces1 + ] + +genericListStart :: Monad m + => OrgParser m String + -> OrgParser m Int +genericListStart listMarker = try $ + (+) <$> (length <$> many spaceChar) + <*> (length <$> listMarker <* many1 spaceChar) + +orderedListStart :: Monad m => OrgParser m Int +orderedListStart = genericListStart orderedListMarker + -- Ordered list markers allowed in org-mode + where orderedListMarker = mappend <$> many1 digit <*> (pure <$> oneOf ".)") + +drawerStart :: Monad m => OrgParser m String +drawerStart = try $ + skipSpaces *> drawerName <* skipSpaces <* newline + where drawerName = char ':' *> manyTill nonspaceChar (char ':') + +metaLineStart :: Monad m => OrgParser m () +metaLineStart = try $ skipSpaces <* string "#+" + +commentLineStart :: Monad m => OrgParser m () +commentLineStart = try $ skipSpaces <* string "# " + +exampleLineStart :: Monad m => OrgParser m () +exampleLineStart = () <$ try (skipSpaces *> string ": ") + +noteMarker :: Monad m => OrgParser m String +noteMarker = try $ do + char '[' + choice [ many1Till digit (char ']') + , (++) <$> string "fn:" + <*> many1Till (noneOf "\n\r\t ") (char ']') + ] + +-- | Succeeds if the parser is at the end of a block. +endOfBlock :: Monad m => OrgParser m () +endOfBlock = lookAhead . try $ do + void blankline <|> anyBlockStart + where + -- Succeeds if there is a new block starting at this position. + anyBlockStart :: Monad m => OrgParser m () + anyBlockStart = try . choice $ + [ exampleLineStart + , hline + , metaLineStart + , commentLineStart + , void noteMarker + , void tableStart + , void drawerStart + , void headerStart + , void latexEnvStart + , void bulletListStart + , void orderedListStart + ] + diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs new file mode 100644 index 000000000..78ac8d0d1 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -0,0 +1,979 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE ViewPatterns #-} +{- +Copyright (C) 2014-2017 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2017 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for Org-mode block elements. +-} +module Text.Pandoc.Readers.Org.Blocks + ( blockList + , meta + ) where + +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.Inlines +import Text.Pandoc.Readers.Org.Meta ( metaExport, metaKey, metaLine ) +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.Shared + ( cleanLinkString, isImageFilename, rundocBlockClass + , toRundocAttrib, translateLang ) + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks ) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared ( compactify, compactifyDL, safeRead ) + +import Control.Monad ( foldM, guard, mzero, void ) +import Data.Char ( isSpace, toLower, toUpper) +import Data.Default ( Default ) +import Data.List ( foldl', isPrefixOf ) +import Data.Maybe ( fromMaybe, isNothing ) +import Data.Monoid ((<>)) + +-- +-- Org headers +-- +newtype Tag = Tag { fromTag :: String } + deriving (Show, Eq) + +-- | Create a tag containing the given string. +toTag :: String -> Tag +toTag = Tag + +-- | The key (also called name or type) of a property. +newtype PropertyKey = PropertyKey { fromKey :: String } + deriving (Show, Eq, Ord) + +-- | Create a property key containing the given string. Org mode keys are +-- case insensitive and are hence converted to lower case. +toPropertyKey :: String -> PropertyKey +toPropertyKey = PropertyKey . map toLower + +-- | The value assigned to a property. +newtype PropertyValue = PropertyValue { fromValue :: String } + +-- | Create a property value containing the given string. +toPropertyValue :: String -> PropertyValue +toPropertyValue = PropertyValue + +-- | Check whether the property value is non-nil (i.e. truish). +isNonNil :: PropertyValue -> Bool +isNonNil p = map toLower (fromValue p) `notElem` ["()", "{}", "nil"] + +-- | Key/value pairs from a PROPERTIES drawer +type Properties = [(PropertyKey, PropertyValue)] + +-- | Org mode headline (i.e. a document subtree). +data Headline = Headline + { headlineLevel :: Int + , headlineTodoMarker :: Maybe TodoMarker + , headlineText :: Inlines + , headlineTags :: [Tag] + , headlineProperties :: Properties + , headlineContents :: Blocks + , headlineChildren :: [Headline] + } + +-- +-- Parsing headlines and subtrees +-- + +-- | Read an Org mode headline and its contents (i.e. a document subtree). +-- @lvl@ gives the minimum acceptable level of the tree. +headline :: PandocMonad m => Int -> OrgParser m (F Headline) +headline lvl = try $ do + level <- headerStart + guard (lvl <= level) + todoKw <- optionMaybe todoKeyword + title <- trimInlinesF . mconcat <$> manyTill inline endOfTitle + tags <- option [] headerTags + newline + properties <- option mempty propertiesDrawer + contents <- blocks + children <- many (headline (level + 1)) + return $ do + title' <- title + contents' <- contents + children' <- sequence children + return $ Headline + { headlineLevel = level + , headlineTodoMarker = todoKw + , headlineText = title' + , headlineTags = tags + , headlineProperties = properties + , headlineContents = contents' + , headlineChildren = children' + } + where + endOfTitle :: Monad m => OrgParser m () + endOfTitle = void . lookAhead $ optional headerTags *> newline + + headerTags :: Monad m => OrgParser m [Tag] + headerTags = try $ + let tag = many1 (alphaNum <|> oneOf "@%#_") <* char ':' + in map toTag <$> (skipSpaces *> char ':' *> many1 tag <* skipSpaces) + +-- | Convert an Org mode headline (i.e. a document tree) into pandoc's Blocks +headlineToBlocks :: Monad m => Headline -> OrgParser m Blocks +headlineToBlocks hdln@(Headline {..}) = do + maxHeadlineLevels <- getExportSetting exportHeadlineLevels + case () of + _ | any isNoExportTag headlineTags -> return mempty + _ | any isArchiveTag headlineTags -> archivedHeadlineToBlocks hdln + _ | isCommentTitle headlineText -> return mempty + _ | headlineLevel >= maxHeadlineLevels -> headlineToHeaderWithList hdln + _ | otherwise -> headlineToHeaderWithContents hdln + +isNoExportTag :: Tag -> Bool +isNoExportTag = (== toTag "noexport") + +isArchiveTag :: Tag -> Bool +isArchiveTag = (== toTag "ARCHIVE") + +-- | Check if the title starts with COMMENT. +-- FIXME: This accesses builder internals not intended for use in situations +-- like these. Replace once keyword parsing is supported. +isCommentTitle :: Inlines -> Bool +isCommentTitle (B.toList -> (Str "COMMENT":_)) = True +isCommentTitle _ = False + +archivedHeadlineToBlocks :: Monad m => Headline -> OrgParser m Blocks +archivedHeadlineToBlocks hdln = do + archivedTreesOption <- getExportSetting exportArchivedTrees + case archivedTreesOption of + ArchivedTreesNoExport -> return mempty + ArchivedTreesExport -> headlineToHeaderWithContents hdln + ArchivedTreesHeadlineOnly -> headlineToHeader hdln + +headlineToHeaderWithList :: Monad m => Headline -> OrgParser m Blocks +headlineToHeaderWithList hdln@(Headline {..}) = do + maxHeadlineLevels <- getExportSetting exportHeadlineLevels + header <- headlineToHeader hdln + listElements <- sequence (map headlineToBlocks headlineChildren) + let listBlock = if null listElements + then mempty + else B.orderedList listElements + let headerText = if maxHeadlineLevels == headlineLevel + then header + else flattenHeader header + return $ headerText <> headlineContents <> listBlock + where + flattenHeader :: Blocks -> Blocks + flattenHeader blks = + case B.toList blks of + (Header _ _ inlns:_) -> B.para (B.fromList inlns) + _ -> mempty + +headlineToHeaderWithContents :: Monad m => Headline -> OrgParser m Blocks +headlineToHeaderWithContents hdln@(Headline {..}) = do + header <- headlineToHeader hdln + childrenBlocks <- mconcat <$> sequence (map headlineToBlocks headlineChildren) + return $ header <> headlineContents <> childrenBlocks + +headlineToHeader :: Monad m => Headline -> OrgParser m Blocks +headlineToHeader (Headline {..}) = do + exportTodoKeyword <- getExportSetting exportWithTodoKeywords + let todoText = if exportTodoKeyword + then case headlineTodoMarker of + Just kw -> todoKeywordToInlines kw <> B.space + Nothing -> mempty + else mempty + let text = tagTitle (todoText <> headlineText) headlineTags + let propAttr = propertiesToAttr headlineProperties + attr <- registerHeader propAttr headlineText + return $ B.headerWith attr headlineLevel text + +todoKeyword :: Monad m => OrgParser m TodoMarker +todoKeyword = try $ do + taskStates <- activeTodoMarkers <$> getState + let kwParser tdm = try $ (tdm <$ string (todoMarkerName tdm) <* spaceChar) + choice (map kwParser taskStates) + +todoKeywordToInlines :: TodoMarker -> Inlines +todoKeywordToInlines tdm = + let todoText = todoMarkerName tdm + todoState = map toLower . show $ todoMarkerState tdm + classes = [todoState, todoText] + in B.spanWith (mempty, classes, mempty) (B.str todoText) + +propertiesToAttr :: Properties -> Attr +propertiesToAttr properties = + let + toStringPair prop = (fromKey (fst prop), fromValue (snd prop)) + customIdKey = toPropertyKey "custom_id" + classKey = toPropertyKey "class" + unnumberedKey = toPropertyKey "unnumbered" + specialProperties = [customIdKey, classKey, unnumberedKey] + id' = fromMaybe mempty . fmap fromValue . lookup customIdKey $ properties + cls = fromMaybe mempty . fmap fromValue . lookup classKey $ properties + kvs' = map toStringPair . filter ((`notElem` specialProperties) . fst) + $ properties + isUnnumbered = + fromMaybe False . fmap isNonNil . lookup unnumberedKey $ properties + in + (id', words cls ++ (if isUnnumbered then ["unnumbered"] else []), kvs') + +tagTitle :: Inlines -> [Tag] -> Inlines +tagTitle title tags = title <> (mconcat $ map tagToInline tags) + +tagToInline :: Tag -> Inlines +tagToInline t = B.spanWith ("", ["tag"], [("data-tag-name", fromTag t)]) mempty + + +-- +-- parsing blocks +-- + +-- | Get a list of blocks. +blockList :: PandocMonad m => OrgParser m [Block] +blockList = do + initialBlocks <- blocks + headlines <- sequence <$> manyTill (headline 1) eof + st <- getState + headlineBlocks <- fmap mconcat . sequence . map headlineToBlocks $ runF headlines st + return . B.toList $ (runF initialBlocks st) <> headlineBlocks + +-- | Get the meta information safed in the state. +meta :: Monad m => OrgParser m Meta +meta = do + meta' <- metaExport + runF meta' <$> getState + +blocks :: PandocMonad m => OrgParser m (F Blocks) +blocks = mconcat <$> manyTill block (void (lookAhead headerStart) <|> eof) + +block :: PandocMonad m => OrgParser m (F Blocks) +block = choice [ mempty <$ blanklines + , table + , orgBlock + , figure + , example + , genericDrawer + , specialLine + , horizontalRule + , list + , latexFragment + , noteBlock + , paraOrPlain + ] <?> "block" + + +-- +-- Block Attributes +-- + +-- | Attributes that may be added to figures (like a name or caption). +data BlockAttributes = BlockAttributes + { blockAttrName :: Maybe String + , blockAttrLabel :: Maybe String + , blockAttrCaption :: Maybe (F Inlines) + , blockAttrKeyValues :: [(String, String)] + } + +-- | Convert BlockAttributes into pandoc Attr +attrFromBlockAttributes :: BlockAttributes -> Attr +attrFromBlockAttributes (BlockAttributes{..}) = + let + ident = fromMaybe mempty $ lookup "id" blockAttrKeyValues + classes = case lookup "class" blockAttrKeyValues of + Nothing -> [] + Just clsStr -> words clsStr + kv = filter ((`notElem` ["id", "class"]) . fst) blockAttrKeyValues + in (ident, classes, kv) + +stringyMetaAttribute :: Monad m => (String -> Bool) -> OrgParser m (String, String) +stringyMetaAttribute attrCheck = try $ do + metaLineStart + attrName <- map toUpper <$> many1Till nonspaceChar (char ':') + guard $ attrCheck attrName + skipSpaces + attrValue <- anyLine + return (attrName, attrValue) + +blockAttributes :: PandocMonad m => OrgParser m BlockAttributes +blockAttributes = try $ do + kv <- many (stringyMetaAttribute attrCheck) + let caption = foldl' (appendValues "CAPTION") Nothing kv + let kvAttrs = foldl' (appendValues "ATTR_HTML") Nothing kv + let name = lookup "NAME" kv + let label = lookup "LABEL" kv + caption' <- case caption of + Nothing -> return Nothing + Just s -> Just <$> parseFromString inlines (s ++ "\n") + kvAttrs' <- parseFromString keyValues . (++ "\n") $ fromMaybe mempty kvAttrs + return $ BlockAttributes + { blockAttrName = name + , blockAttrLabel = label + , blockAttrCaption = caption' + , blockAttrKeyValues = kvAttrs' + } + where + attrCheck :: String -> Bool + attrCheck attr = + case attr of + "NAME" -> True + "LABEL" -> True + "CAPTION" -> True + "ATTR_HTML" -> True + _ -> False + + appendValues :: String -> Maybe String -> (String, String) -> Maybe String + appendValues attrName accValue (key, value) = + if key /= attrName + then accValue + else case accValue of + Just acc -> Just $ acc ++ ' ':value + Nothing -> Just value + +keyValues :: Monad m => OrgParser m [(String, String)] +keyValues = try $ + manyTill ((,) <$> key <*> value) newline + where + key :: Monad m => OrgParser m String + key = try $ skipSpaces *> char ':' *> many1 nonspaceChar + + value :: Monad m => OrgParser m String + value = skipSpaces *> manyTill anyChar endOfValue + + endOfValue :: Monad m => OrgParser m () + endOfValue = + lookAhead $ (() <$ try (many1 spaceChar <* key)) + <|> () <$ newline + + +-- +-- Org Blocks (#+BEGIN_... / #+END_...) +-- + +-- | Read an org-mode block delimited by #+BEGIN_TYPE and #+END_TYPE. +orgBlock :: PandocMonad m => OrgParser m (F Blocks) +orgBlock = try $ do + blockAttrs <- blockAttributes + blkType <- blockHeaderStart + ($ blkType) $ + case (map toLower blkType) of + "export" -> exportBlock + "comment" -> rawBlockLines (const mempty) + "html" -> rawBlockLines (return . B.rawBlock (lowercase blkType)) + "latex" -> rawBlockLines (return . B.rawBlock (lowercase blkType)) + "ascii" -> rawBlockLines (return . B.rawBlock (lowercase blkType)) + "example" -> rawBlockLines (return . exampleCode) + "quote" -> parseBlockLines (fmap B.blockQuote) + "verse" -> verseBlock + "src" -> codeBlock blockAttrs + _ -> parseBlockLines $ + let (ident, classes, kv) = attrFromBlockAttributes blockAttrs + in fmap $ B.divWith (ident, classes ++ [blkType], kv) + where + blockHeaderStart :: Monad m => OrgParser m String + blockHeaderStart = try $ skipSpaces *> stringAnyCase "#+begin_" *> orgArgWord + + lowercase :: String -> String + lowercase = map toLower + +rawBlockLines :: Monad m => (String -> F Blocks) -> String -> OrgParser m (F Blocks) +rawBlockLines f blockType = (ignHeaders *> (f <$> rawBlockContent blockType)) + +parseBlockLines :: PandocMonad m => (F Blocks -> F Blocks) -> String -> OrgParser m (F Blocks) +parseBlockLines f blockType = (ignHeaders *> (f <$> parsedBlockContent)) + where + parsedBlockContent :: PandocMonad m => OrgParser m (F Blocks) + parsedBlockContent = try $ do + raw <- rawBlockContent blockType + parseFromString blocks (raw ++ "\n") + +-- | Read the raw string content of a block +rawBlockContent :: Monad m => String -> OrgParser m String +rawBlockContent blockType = try $ do + blkLines <- manyTill rawLine blockEnder + tabLen <- getOption readerTabStop + return + . unlines + . stripIndent + . map (tabsToSpaces tabLen . commaEscaped) + $ blkLines + where + rawLine :: Monad m => OrgParser m String + rawLine = try $ ("" <$ blankline) <|> anyLine + + blockEnder :: Monad m => OrgParser m () + blockEnder = try $ skipSpaces <* stringAnyCase ("#+end_" <> blockType) + + stripIndent :: [String] -> [String] + stripIndent strs = map (drop (shortestIndent strs)) strs + + shortestIndent :: [String] -> Int + shortestIndent = foldr min maxBound + . map (length . takeWhile isSpace) + . filter (not . null) + + tabsToSpaces :: Int -> String -> String + tabsToSpaces _ [] = [] + tabsToSpaces tabLen cs'@(c:cs) = + case c of + ' ' -> ' ':tabsToSpaces tabLen cs + '\t' -> (take tabLen $ repeat ' ') ++ tabsToSpaces tabLen cs + _ -> cs' + + commaEscaped :: String -> String + commaEscaped (',':cs@('*':_)) = cs + commaEscaped (',':cs@('#':'+':_)) = cs + commaEscaped (' ':cs) = ' ':commaEscaped cs + commaEscaped ('\t':cs) = '\t':commaEscaped cs + commaEscaped cs = cs + +-- | Read but ignore all remaining block headers. +ignHeaders :: Monad m => OrgParser m () +ignHeaders = (() <$ newline) <|> (() <$ anyLine) + +-- | Read a block containing code intended for export in specific backends +-- only. +exportBlock :: Monad m => String -> OrgParser m (F Blocks) +exportBlock blockType = try $ do + exportType <- skipSpaces *> orgArgWord <* ignHeaders + contents <- rawBlockContent blockType + returnF (B.rawBlock (map toLower exportType) contents) + +verseBlock :: PandocMonad m => String -> OrgParser m (F Blocks) +verseBlock blockType = try $ do + ignHeaders + content <- rawBlockContent blockType + fmap B.lineBlock . sequence + <$> mapM parseVerseLine (lines content) + where + -- replace initial spaces with nonbreaking spaces to preserve + -- indentation, parse the rest as normal inline + parseVerseLine :: PandocMonad m => String -> OrgParser m (F Inlines) + parseVerseLine cs = do + let (initialSpaces, indentedLine) = span isSpace cs + let nbspIndent = if null initialSpaces + then mempty + else B.str $ map (const '\160') initialSpaces + line <- parseFromString inlines (indentedLine ++ "\n") + return (trimInlinesF $ pure nbspIndent <> line) + +-- | Read a code block and the associated results block if present. Which of +-- boths blocks is included in the output is determined using the "exports" +-- argument in the block header. +codeBlock :: PandocMonad m => BlockAttributes -> String -> OrgParser m (F Blocks) +codeBlock blockAttrs blockType = do + skipSpaces + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + content <- rawBlockContent blockType + resultsContent <- trailingResultsBlock + let id' = fromMaybe mempty $ blockAttrName blockAttrs + let includeCode = exportsCode kv + let includeResults = exportsResults kv + let codeBlck = B.codeBlockWith ( id', classes, kv ) content + let labelledBlck = maybe (pure codeBlck) + (labelDiv codeBlck) + (blockAttrCaption blockAttrs) + let resultBlck = fromMaybe mempty resultsContent + return $ + (if includeCode then labelledBlck else mempty) <> + (if includeResults then resultBlck else mempty) + where + labelDiv :: Blocks -> F Inlines -> F Blocks + labelDiv blk value = + B.divWith nullAttr <$> (mappend <$> labelledBlock value <*> pure blk) + + labelledBlock :: F Inlines -> F Blocks + labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + +exportsCode :: [(String, String)] -> Bool +exportsCode attrs = not (("rundoc-exports", "none") `elem` attrs + || ("rundoc-exports", "results") `elem` attrs) + +exportsResults :: [(String, String)] -> Bool +exportsResults attrs = ("rundoc-exports", "results") `elem` attrs + || ("rundoc-exports", "both") `elem` attrs + +trailingResultsBlock :: PandocMonad m => OrgParser m (Maybe (F Blocks)) +trailingResultsBlock = optionMaybe . try $ do + blanklines + stringAnyCase "#+RESULTS:" + blankline + block + +-- | Parse code block arguments +-- TODO: We currently don't handle switches. +codeHeaderArgs :: Monad m => OrgParser m ([String], [(String, String)]) +codeHeaderArgs = try $ do + language <- skipSpaces *> orgArgWord + _ <- skipSpaces *> (try $ switch `sepBy` (many1 spaceChar)) + parameters <- manyTill blockOption newline + let pandocLang = translateLang language + return $ + if hasRundocParameters parameters + then ( [ pandocLang, rundocBlockClass ] + , map toRundocAttrib (("language", language) : parameters) + ) + else ([ pandocLang ], parameters) + where + hasRundocParameters = not . null + +switch :: Monad m => OrgParser m (Char, Maybe String) +switch = try $ simpleSwitch <|> lineNumbersSwitch + where + simpleSwitch = (\c -> (c, Nothing)) <$> (oneOf "-+" *> letter) + lineNumbersSwitch = (\ls -> ('l', Just ls)) <$> + (string "-l \"" *> many1Till nonspaceChar (char '"')) + +blockOption :: Monad m => OrgParser m (String, String) +blockOption = try $ do + argKey <- orgArgKey + paramValue <- option "yes" orgParamValue + return (argKey, paramValue) + +orgParamValue :: Monad m => OrgParser m String +orgParamValue = try $ + skipSpaces + *> notFollowedBy (char ':' ) + *> many1 nonspaceChar + <* skipSpaces + +horizontalRule :: Monad m => OrgParser m (F Blocks) +horizontalRule = return B.horizontalRule <$ try hline + + +-- +-- Drawers +-- + +-- | A generic drawer which has no special meaning for org-mode. +-- Whether or not this drawer is included in the output depends on the drawers +-- export setting. +genericDrawer :: PandocMonad m => OrgParser m (F Blocks) +genericDrawer = try $ do + name <- map toUpper <$> drawerStart + content <- manyTill drawerLine (try drawerEnd) + state <- getState + -- Include drawer if it is explicitly included in or not explicitly excluded + -- from the list of drawers that should be exported. PROPERTIES drawers are + -- never exported. + case (exportDrawers . orgStateExportSettings $ state) of + _ | name == "PROPERTIES" -> return mempty + Left names | name `elem` names -> return mempty + Right names | name `notElem` names -> return mempty + _ -> drawerDiv name <$> parseLines content + where + parseLines :: PandocMonad m => [String] -> OrgParser m (F Blocks) + parseLines = parseFromString blocks . (++ "\n") . unlines + + drawerDiv :: String -> F Blocks -> F Blocks + drawerDiv drawerName = fmap $ B.divWith (mempty, [drawerName, "drawer"], mempty) + +drawerLine :: Monad m => OrgParser m String +drawerLine = anyLine + +drawerEnd :: Monad m => OrgParser m String +drawerEnd = try $ + skipSpaces *> stringAnyCase ":END:" <* skipSpaces <* newline + +-- | Read a :PROPERTIES: drawer and return the key/value pairs contained +-- within. +propertiesDrawer :: Monad m => OrgParser m Properties +propertiesDrawer = try $ do + drawerType <- drawerStart + guard $ map toUpper drawerType == "PROPERTIES" + manyTill property (try drawerEnd) + where + property :: Monad m => OrgParser m (PropertyKey, PropertyValue) + property = try $ (,) <$> key <*> value + + key :: Monad m => OrgParser m PropertyKey + key = fmap toPropertyKey . try $ + skipSpaces *> char ':' *> many1Till nonspaceChar (char ':') + + value :: Monad m => OrgParser m PropertyValue + value = fmap toPropertyValue . try $ + skipSpaces *> manyTill anyChar (try $ skipSpaces *> newline) + + +-- +-- Figures +-- + +-- | Figures or an image paragraph (i.e. an image on a line by itself). Only +-- images with a caption attribute are interpreted as figures. +figure :: PandocMonad m => OrgParser m (F Blocks) +figure = try $ do + figAttrs <- blockAttributes + src <- skipSpaces *> selfTarget <* skipSpaces <* endOfParagraph + case cleanLinkString src of + Nothing -> mzero + Just imgSrc -> do + guard (isImageFilename imgSrc) + let isFigure = not . isNothing $ blockAttrCaption figAttrs + return $ imageBlock isFigure figAttrs imgSrc + where + selfTarget :: PandocMonad m => OrgParser m String + selfTarget = try $ char '[' *> linkTarget <* char ']' + + imageBlock :: Bool -> BlockAttributes -> String -> F Blocks + imageBlock isFigure figAttrs imgSrc = + let + figName = fromMaybe mempty $ blockAttrName figAttrs + figLabel = fromMaybe mempty $ blockAttrLabel figAttrs + figCaption = fromMaybe mempty $ blockAttrCaption figAttrs + figKeyVals = blockAttrKeyValues figAttrs + attr = (figLabel, mempty, figKeyVals) + figTitle = (if isFigure then withFigPrefix else id) figName + in + B.para . B.imageWith attr imgSrc figTitle <$> figCaption + + withFigPrefix :: String -> String + withFigPrefix cs = + if "fig:" `isPrefixOf` cs + then cs + else "fig:" ++ cs + +-- | Succeeds if looking at the end of the current paragraph +endOfParagraph :: Monad m => OrgParser m () +endOfParagraph = try $ skipSpaces *> newline *> endOfBlock + + +-- +-- Examples +-- + +-- | Example code marked up by a leading colon. +example :: Monad m => OrgParser m (F Blocks) +example = try $ do + return . return . exampleCode =<< unlines <$> many1 exampleLine + where + exampleLine :: Monad m => OrgParser m String + exampleLine = try $ exampleLineStart *> anyLine + +exampleCode :: String -> Blocks +exampleCode = B.codeBlockWith ("", ["example"], []) + + +-- +-- Comments, Options and Metadata +-- + +specialLine :: PandocMonad m => OrgParser m (F Blocks) +specialLine = fmap return . try $ rawExportLine <|> metaLine <|> commentLine + +rawExportLine :: PandocMonad m => OrgParser m Blocks +rawExportLine = try $ do + metaLineStart + key <- metaKey + if key `elem` ["latex", "html", "texinfo", "beamer"] + then B.rawBlock key <$> anyLine + else mzero + +commentLine :: Monad m => OrgParser m Blocks +commentLine = commentLineStart *> anyLine *> pure mempty + + +-- +-- Tables +-- +data ColumnProperty = ColumnProperty + { columnAlignment :: Maybe Alignment + , columnRelWidth :: Maybe Int + } deriving (Show, Eq) + +instance Default ColumnProperty where + def = ColumnProperty Nothing Nothing + +data OrgTableRow = OrgContentRow (F [Blocks]) + | OrgAlignRow [ColumnProperty] + | OrgHlineRow + +-- OrgTable is strongly related to the pandoc table ADT. Using the same +-- (i.e. pandoc-global) ADT would mean that the reader would break if the +-- global structure was to be changed, which would be bad. The final table +-- should be generated using a builder function. +data OrgTable = OrgTable + { orgTableColumnProperties :: [ColumnProperty] + , orgTableHeader :: [Blocks] + , orgTableRows :: [[Blocks]] + } + +table :: PandocMonad m => OrgParser m (F Blocks) +table = try $ do + blockAttrs <- blockAttributes + lookAhead tableStart + do + rows <- tableRows + let caption = fromMaybe (return mempty) $ blockAttrCaption blockAttrs + return $ (<$> caption) . orgToPandocTable . normalizeTable =<< rowsToTable rows + +orgToPandocTable :: OrgTable + -> Inlines + -> Blocks +orgToPandocTable (OrgTable colProps heads lns) caption = + let totalWidth = if any (not . isNothing) (map columnRelWidth colProps) + then Just . sum $ map (fromMaybe 1 . columnRelWidth) colProps + else Nothing + in B.table caption (map (convertColProp totalWidth) colProps) heads lns + where + convertColProp :: Maybe Int -> ColumnProperty -> (Alignment, Double) + convertColProp totalWidth colProp = + let + align' = fromMaybe AlignDefault $ columnAlignment colProp + width' = fromMaybe 0 $ (\w t -> (fromIntegral w / fromIntegral t)) + <$> (columnRelWidth colProp) + <*> totalWidth + in (align', width') + +tableRows :: PandocMonad m => OrgParser m [OrgTableRow] +tableRows = try $ many (tableAlignRow <|> tableHline <|> tableContentRow) + +tableContentRow :: PandocMonad m => OrgParser m OrgTableRow +tableContentRow = try $ + OrgContentRow . sequence <$> (tableStart *> many1Till tableContentCell newline) + +tableContentCell :: PandocMonad m => OrgParser m (F Blocks) +tableContentCell = try $ + fmap B.plain . trimInlinesF . mconcat <$> manyTill inline endOfCell + +tableAlignRow :: Monad m => OrgParser m OrgTableRow +tableAlignRow = try $ do + tableStart + colProps <- many1Till columnPropertyCell newline + -- Empty rows are regular (i.e. content) rows, not alignment rows. + guard $ any (/= def) colProps + return $ OrgAlignRow colProps + +columnPropertyCell :: Monad m => OrgParser m ColumnProperty +columnPropertyCell = emptyCell <|> propCell <?> "alignment info" + where + emptyCell = ColumnProperty Nothing Nothing <$ (try $ skipSpaces *> endOfCell) + propCell = try $ ColumnProperty + <$> (skipSpaces + *> char '<' + *> optionMaybe tableAlignFromChar) + <*> (optionMaybe (many1 digit >>= safeRead) + <* char '>' + <* emptyCell) + +tableAlignFromChar :: Monad m => OrgParser m Alignment +tableAlignFromChar = try $ + choice [ char 'l' *> return AlignLeft + , char 'c' *> return AlignCenter + , char 'r' *> return AlignRight + ] + +tableHline :: Monad m => OrgParser m OrgTableRow +tableHline = try $ + OrgHlineRow <$ (tableStart *> char '-' *> anyLine) + +endOfCell :: Monad m => OrgParser m Char +endOfCell = try $ char '|' <|> lookAhead newline + +rowsToTable :: [OrgTableRow] + -> F OrgTable +rowsToTable = foldM rowToContent emptyTable + where emptyTable = OrgTable mempty mempty mempty + +normalizeTable :: OrgTable -> OrgTable +normalizeTable (OrgTable colProps heads rows) = + OrgTable colProps' heads rows + where + refRow = if heads /= mempty + then heads + else case rows of + (r:_) -> r + _ -> mempty + cols = length refRow + fillColumns base padding = take cols $ base ++ repeat padding + colProps' = fillColumns colProps def + +-- One or more horizontal rules after the first content line mark the previous +-- line as a header. All other horizontal lines are discarded. +rowToContent :: OrgTable + -> OrgTableRow + -> F OrgTable +rowToContent orgTable row = + case row of + OrgHlineRow -> return singleRowPromotedToHeader + OrgAlignRow props -> return . setProperties $ props + OrgContentRow cs -> appendToBody cs + where + singleRowPromotedToHeader :: OrgTable + singleRowPromotedToHeader = case orgTable of + OrgTable{ orgTableHeader = [], orgTableRows = b:[] } -> + orgTable{ orgTableHeader = b , orgTableRows = [] } + _ -> orgTable + + setProperties :: [ColumnProperty] -> OrgTable + setProperties ps = orgTable{ orgTableColumnProperties = ps } + + appendToBody :: F [Blocks] -> F OrgTable + appendToBody frow = do + newRow <- frow + let oldRows = orgTableRows orgTable + -- NOTE: This is an inefficient O(n) operation. This should be changed + -- if performance ever becomes a problem. + return orgTable{ orgTableRows = oldRows ++ [newRow] } + + +-- +-- LaTeX fragments +-- +latexFragment :: Monad m => OrgParser m (F Blocks) +latexFragment = try $ do + envName <- latexEnvStart + content <- mconcat <$> manyTill anyLineNewline (latexEnd envName) + return . return $ B.rawBlock "latex" (content `inLatexEnv` envName) + where + c `inLatexEnv` e = mconcat [ "\\begin{", e, "}\n" + , c + , "\\end{", e, "}\n" + ] + +latexEnd :: Monad m => String -> OrgParser m () +latexEnd envName = try $ + () <$ skipSpaces + <* string ("\\end{" ++ envName ++ "}") + <* blankline + + +-- +-- Footnote defintions +-- +noteBlock :: PandocMonad m => OrgParser m (F Blocks) +noteBlock = try $ do + ref <- noteMarker <* skipSpaces + content <- mconcat <$> blocksTillHeaderOrNote + addToNotesTable (ref, content) + return mempty + where + blocksTillHeaderOrNote = + many1Till block (eof <|> () <$ lookAhead noteMarker + <|> () <$ lookAhead headerStart) + +-- Paragraphs or Plain text +paraOrPlain :: PandocMonad m => OrgParser m (F Blocks) +paraOrPlain = try $ do + -- Make sure we are not looking at a headline + notFollowedBy' (char '*' *> (oneOf " *")) + ils <- inlines + nl <- option False (newline *> return True) + -- Read block as paragraph, except if we are in a list context and the block + -- is directly followed by a list item, in which case the block is read as + -- plain text. + try (guard nl + *> notFollowedBy (inList *> (() <$ orderedListStart <|> bulletListStart)) + *> return (B.para <$> ils)) + <|> (return (B.plain <$> ils)) + + +-- +-- list blocks +-- + +list :: PandocMonad m => OrgParser m (F Blocks) +list = choice [ definitionList, bulletList, orderedList ] <?> "list" + +definitionList :: PandocMonad m => OrgParser m (F Blocks) +definitionList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.definitionList . fmap compactifyDL . sequence + <$> many1 (definitionListItem $ bulletListStart' (Just n)) + +bulletList :: PandocMonad m => OrgParser m (F Blocks) +bulletList = try $ do n <- lookAhead (bulletListStart' Nothing) + fmap B.bulletList . fmap compactify . sequence + <$> many1 (listItem (bulletListStart' $ Just n)) + +orderedList :: PandocMonad m => OrgParser m (F Blocks) +orderedList = fmap B.orderedList . fmap compactify . sequence + <$> many1 (listItem orderedListStart) + +bulletListStart' :: Monad m => Maybe Int -> OrgParser m Int +-- returns length of bulletList prefix, inclusive of marker +bulletListStart' Nothing = do ind <- length <$> many spaceChar + oneOf (bullets $ ind == 0) + skipSpaces1 + return (ind + 1) +bulletListStart' (Just n) = do count (n-1) spaceChar + oneOf (bullets $ n == 1) + many1 spaceChar + return n + +-- Unindented lists are legal, but they can't use '*' bullets. +-- We return n to maintain compatibility with the generic listItem. +bullets :: Bool -> String +bullets unindented = if unindented then "+-" else "*+-" + +definitionListItem :: PandocMonad m + => OrgParser m Int + -> OrgParser m (F (Inlines, [Blocks])) +definitionListItem parseMarkerGetLength = try $ do + markerLength <- parseMarkerGetLength + term <- manyTill (noneOf "\n\r") (try definitionMarker) + line1 <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + cont <- concat <$> many (listContinuation markerLength) + term' <- parseFromString inlines term + contents' <- parseFromString blocks $ line1 ++ blank ++ cont + return $ (,) <$> term' <*> fmap (:[]) contents' + where + definitionMarker = + spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) + + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: PandocMonad m + => OrgParser m Int + -> OrgParser m (F Blocks) +listItem start = try . withContext ListItemState $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString blocks $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Monad m => Int + -> OrgParser m String +listContinuation markerLength = try $ + notFollowedBy' blankline + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where + listLine = try $ indentWith markerLength *> anyLineNewline + + -- indent by specified number of spaces (or equiv. tabs) + indentWith :: Monad m => Int -> OrgParser m String + indentWith num = do + tabStop <- getOption readerTabStop + if num < tabStop + then count num (char ' ') + else choice [ try (count num (char ' ')) + , try (char '\t' >> count (num - tabStop) (char ' ')) ] + +-- | Parse any line, include the final newline in the output. +anyLineNewline :: Monad m => OrgParser m String +anyLineNewline = (++ "\n") <$> anyLine diff --git a/src/Text/Pandoc/Readers/Org/ExportSettings.hs b/src/Text/Pandoc/Readers/Org/ExportSettings.hs new file mode 100644 index 000000000..391877c03 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/ExportSettings.hs @@ -0,0 +1,172 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for Org-mode export options. +-} +module Text.Pandoc.Readers.Org.ExportSettings + ( exportSettings + ) where + +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import Control.Monad ( mzero, void ) +import Data.Char ( toLower ) +import Data.Maybe ( listToMaybe ) + +-- | Read and handle space separated org-mode export settings. +exportSettings :: Monad m => OrgParser m () +exportSettings = void $ sepBy spaces exportSetting + +-- | Setter function for export settings. +type ExportSettingSetter a = a -> ExportSettings -> ExportSettings + +-- | Read and process a single org-mode export option. +exportSetting :: Monad m => OrgParser m () +exportSetting = choice + [ booleanSetting "^" (\val es -> es { exportSubSuperscripts = val }) + , booleanSetting "'" (\val es -> es { exportSmartQuotes = val }) + , booleanSetting "*" (\val es -> es { exportEmphasizedText = val }) + , booleanSetting "-" (\val es -> es { exportSpecialStrings = val }) + , ignoredSetting ":" + , ignoredSetting "<" + , ignoredSetting "\\n" + , archivedTreeSetting "arch" (\val es -> es { exportArchivedTrees = val }) + , booleanSetting "author" (\val es -> es { exportWithAuthor = val }) + , ignoredSetting "c" + -- org-mode allows the special value `comment` for creator, which we'll + -- interpret as true as it doesn't make sense in the context of Pandoc. + , booleanSetting "creator" (\val es -> es { exportWithCreator = val }) + , complementableListSetting "d" (\val es -> es { exportDrawers = val }) + , ignoredSetting "date" + , ignoredSetting "e" + , booleanSetting "email" (\val es -> es { exportWithEmail = val }) + , ignoredSetting "f" + , integerSetting "H" (\val es -> es { exportHeadlineLevels = val }) + , ignoredSetting "inline" + , ignoredSetting "num" + , ignoredSetting "p" + , ignoredSetting "pri" + , ignoredSetting "prop" + , ignoredSetting "stat" + , ignoredSetting "tags" + , ignoredSetting "tasks" + , ignoredSetting "tex" + , ignoredSetting "timestamp" + , ignoredSetting "title" + , ignoredSetting "toc" + , booleanSetting "todo" (\val es -> es { exportWithTodoKeywords = val }) + , ignoredSetting "|" + ] <?> "export setting" + +genericExportSetting :: Monad m + => OrgParser m a + -> String + -> ExportSettingSetter a + -> OrgParser m () +genericExportSetting optionParser settingIdentifier setter = try $ do + _ <- string settingIdentifier *> char ':' + value <- optionParser + updateState $ modifyExportSettings value + where + modifyExportSettings val st = + st { orgStateExportSettings = setter val . orgStateExportSettings $ st } + +-- | A boolean option, either nil (False) or non-nil (True). +booleanSetting :: Monad m => String -> ExportSettingSetter Bool -> OrgParser m () +booleanSetting = genericExportSetting elispBoolean + +-- | An integer-valued option. +integerSetting :: Monad m => String -> ExportSettingSetter Int -> OrgParser m () +integerSetting = genericExportSetting parseInt + where + parseInt = try $ + many1 digit >>= maybe mzero (return . fst) . listToMaybe . reads + +-- | Either the string "headline" or an elisp boolean and treated as an +-- @ArchivedTreesOption@. +archivedTreeSetting :: Monad m + => String + -> ExportSettingSetter ArchivedTreesOption + -> OrgParser m () +archivedTreeSetting = + genericExportSetting $ archivedTreesHeadlineSetting <|> archivedTreesBoolean + where + archivedTreesHeadlineSetting = try $ do + _ <- string "headline" + lookAhead (newline <|> spaceChar) + return ArchivedTreesHeadlineOnly + + archivedTreesBoolean = try $ do + exportBool <- elispBoolean + return $ + if exportBool + then ArchivedTreesExport + else ArchivedTreesNoExport + +-- | A list or a complement list (i.e. a list starting with `not`). +complementableListSetting :: Monad m + => String + -> ExportSettingSetter (Either [String] [String]) + -> OrgParser m () +complementableListSetting = genericExportSetting $ choice + [ Left <$> complementStringList + , Right <$> stringList + , (\b -> if b then Left [] else Right []) <$> elispBoolean + ] + where + -- Read a plain list of strings. + stringList :: Monad m => OrgParser m [String] + stringList = try $ + char '(' + *> sepBy elispString spaces + <* char ')' + + -- Read an emacs lisp list specifying a complement set. + complementStringList :: Monad m => OrgParser m [String] + complementStringList = try $ + string "(not " + *> sepBy elispString spaces + <* char ')' + + elispString :: Monad m => OrgParser m String + elispString = try $ + char '"' + *> manyTill alphaNum (char '"') + +-- | Read but ignore the export setting. +ignoredSetting :: Monad m => String -> OrgParser m () +ignoredSetting s = try (() <$ string s <* char ':' <* many1 nonspaceChar) + +-- | Read an elisp boolean. Only NIL is treated as false, non-NIL values are +-- interpreted as true. +elispBoolean :: Monad m => OrgParser m Bool +elispBoolean = try $ do + value <- many1 nonspaceChar + return $ case map toLower value of + "nil" -> False + "{}" -> False + "()" -> False + _ -> True diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs new file mode 100644 index 000000000..f3671641a --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -0,0 +1,880 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for Org-mode inline elements. +-} +module Text.Pandoc.Readers.Org.Inlines + ( inline + , inlines + , addToNotesTable + , linkTarget + ) where + +import Text.Pandoc.Readers.Org.BlockStarts ( endOfBlock, noteMarker ) +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing +import Text.Pandoc.Readers.Org.Shared + ( cleanLinkString, isImageFilename, rundocBlockClass + , toRundocAttrib, translateLang ) + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines ) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Readers.LaTeX ( inlineCommand, rawLaTeXInline ) +import Text.TeXMath ( readTeX, writePandoc, DisplayType(..) ) +import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap +import Text.Pandoc.Class (PandocMonad) + +import Prelude hiding (sequence) +import Control.Monad ( guard, mplus, mzero, when, void ) +import Control.Monad.Trans ( lift ) +import Data.Char ( isAlphaNum, isSpace ) +import Data.List ( intersperse ) +import Data.Maybe ( fromMaybe ) +import qualified Data.Map as M +import Data.Monoid ( (<>) ) +import Data.Traversable (sequence) + +-- +-- Functions acting on the parser state +-- +recordAnchorId :: PandocMonad m => String -> OrgParser m () +recordAnchorId i = updateState $ \s -> + s{ orgStateAnchorIds = i : (orgStateAnchorIds s) } + +pushToInlineCharStack :: PandocMonad m => Char -> OrgParser m () +pushToInlineCharStack c = updateState $ \s -> + s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s } + +popInlineCharStack :: PandocMonad m => OrgParser m () +popInlineCharStack = updateState $ \s -> + s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s } + +surroundingEmphasisChar :: PandocMonad m => OrgParser m [Char] +surroundingEmphasisChar = + take 1 . drop 1 . orgStateEmphasisCharStack <$> getState + +startEmphasisNewlinesCounting :: PandocMonad m => Int -> OrgParser m () +startEmphasisNewlinesCounting maxNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Just maxNewlines } + +decEmphasisNewlinesCount :: PandocMonad m => OrgParser m () +decEmphasisNewlinesCount = updateState $ \s -> + s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s } + +newlinesCountWithinLimits :: PandocMonad m => OrgParser m Bool +newlinesCountWithinLimits = do + st <- getState + return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True + +resetEmphasisNewlines :: PandocMonad m => OrgParser m () +resetEmphasisNewlines = updateState $ \s -> + s{ orgStateEmphasisNewlines = Nothing } + +addToNotesTable :: PandocMonad m => OrgNoteRecord -> OrgParser m () +addToNotesTable note = do + oldnotes <- orgStateNotes' <$> getState + updateState $ \s -> s{ orgStateNotes' = note:oldnotes } + +-- | Parse a single Org-mode inline element +inline :: PandocMonad m => OrgParser m (F Inlines) +inline = + choice [ whitespace + , linebreak + , cite + , footnote + , linkOrImage + , anchor + , inlineCodeBlock + , str + , endline + , emphasizedText + , code + , math + , displayMath + , verbatim + , subscript + , superscript + , inlineLaTeX + , exportSnippet + , smart + , symbol + ] <* (guard =<< newlinesCountWithinLimits) + <?> "inline" + +-- | Read the rest of the input as inlines. +inlines :: PandocMonad m => OrgParser m (F Inlines) +inlines = trimInlinesF . mconcat <$> many1 inline + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\"$'()*+-,./:;<=>@[\\]^_{|}~" + + +whitespace :: PandocMonad m => OrgParser m (F Inlines) +whitespace = pure B.space <$ skipMany1 spaceChar + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + <?> "whitespace" + +linebreak :: PandocMonad m => OrgParser m (F Inlines) +linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline + +str :: PandocMonad m => OrgParser m (F Inlines) +str = return . B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + <* updateLastStrPos + +-- | An endline character that can be treated as a space, not a structural +-- break. This should reflect the values of the Emacs variable +-- @org-element-pagaraph-separate@. +endline :: PandocMonad m => OrgParser m (F Inlines) +endline = try $ do + newline + notFollowedBy' endOfBlock + decEmphasisNewlinesCount + guard =<< newlinesCountWithinLimits + updateLastPreCharPos + return . return $ B.softbreak + + +-- +-- Citations +-- + +-- The state of citations is a bit confusing due to the lack of an official +-- syntax and multiple syntaxes coexisting. The pandocOrgCite syntax was the +-- first to be implemented here and is almost identical to Markdown's citation +-- syntax. The org-ref package is in wide use to handle citations, but the +-- syntax is a bit limiting and not quite as simple to write. The +-- semi-offical Org-mode citation syntax is based on John MacFarlane's Pandoc +-- sytax and Org-oriented enhancements contributed by Richard Lawrence and +-- others. It's dubbed Berkeley syntax due the place of activity of its main +-- contributors. All this should be consolidated once an official Org-mode +-- citation syntax has emerged. + +cite :: PandocMonad m => OrgParser m (F Inlines) +cite = try $ berkeleyCite <|> do + guardEnabled Ext_citations + (cs, raw) <- withRaw $ choice + [ pandocOrgCite + , orgRefCite + , berkeleyTextualCite + ] + return $ (flip B.cite (B.text raw)) <$> cs + +-- | A citation in Pandoc Org-mode style (@[prefix \@citekey suffix]@). +pandocOrgCite :: PandocMonad m => OrgParser m (F [Citation]) +pandocOrgCite = try $ + char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']' + +orgRefCite :: PandocMonad m => OrgParser m (F [Citation]) +orgRefCite = try $ choice + [ normalOrgRefCite + , fmap (:[]) <$> linkLikeOrgRefCite + ] + +normalOrgRefCite :: PandocMonad m => OrgParser m (F [Citation]) +normalOrgRefCite = try $ do + mode <- orgRefCiteMode + firstCitation <- orgRefCiteList mode + moreCitations <- many (try $ char ',' *> orgRefCiteList mode) + return . sequence $ firstCitation : moreCitations + where + -- | A list of org-ref style citation keys, parsed as citation of the given + -- citation mode. + orgRefCiteList :: PandocMonad m => CitationMode -> OrgParser m (F Citation) + orgRefCiteList citeMode = try $ do + key <- orgRefCiteKey + returnF $ Citation + { citationId = key + , citationPrefix = mempty + , citationSuffix = mempty + , citationMode = citeMode + , citationNoteNum = 0 + , citationHash = 0 + } + +-- | Read an Berkeley-style Org-mode citation. Berkeley citation style was +-- develop and adjusted to Org-mode style by John MacFarlane and Richard +-- Lawrence, respectively, both philosophers at UC Berkeley. +berkeleyCite :: PandocMonad m => OrgParser m (F Inlines) +berkeleyCite = try $ do + bcl <- berkeleyCitationList + return $ do + parens <- berkeleyCiteParens <$> bcl + prefix <- berkeleyCiteCommonPrefix <$> bcl + suffix <- berkeleyCiteCommonSuffix <$> bcl + citationList <- berkeleyCiteCitations <$> bcl + return $ + if parens + then toCite + . maybe id (\p -> alterFirst (prependPrefix p)) prefix + . maybe id (\s -> alterLast (appendSuffix s)) suffix + $ citationList + else maybe mempty (<> " ") prefix + <> (toListOfCites $ map toInTextMode citationList) + <> maybe mempty (", " <>) suffix + where + toCite :: [Citation] -> Inlines + toCite cs = B.cite cs mempty + + toListOfCites :: [Citation] -> Inlines + toListOfCites = mconcat . intersperse ", " . map (\c -> B.cite [c] mempty) + + toInTextMode :: Citation -> Citation + toInTextMode c = c { citationMode = AuthorInText } + + alterFirst, alterLast :: (a -> a) -> [a] -> [a] + alterFirst _ [] = [] + alterFirst f (c:cs) = (f c):cs + alterLast f = reverse . alterFirst f . reverse + + prependPrefix, appendSuffix :: Inlines -> Citation -> Citation + prependPrefix pre c = c { citationPrefix = B.toList pre <> citationPrefix c } + appendSuffix suf c = c { citationSuffix = citationSuffix c <> B.toList suf } + +data BerkeleyCitationList = BerkeleyCitationList + { berkeleyCiteParens :: Bool + , berkeleyCiteCommonPrefix :: Maybe Inlines + , berkeleyCiteCommonSuffix :: Maybe Inlines + , berkeleyCiteCitations :: [Citation] + } +berkeleyCitationList :: PandocMonad m => OrgParser m (F BerkeleyCitationList) +berkeleyCitationList = try $ do + char '[' + parens <- choice [ False <$ berkeleyBareTag, True <$ berkeleyParensTag ] + char ':' + skipSpaces + commonPrefix <- optionMaybe (try $ citationListPart <* char ';') + citations <- citeList + commonSuffix <- optionMaybe (try $ citationListPart) + char ']' + return (BerkeleyCitationList parens + <$> sequence commonPrefix + <*> sequence commonSuffix + <*> citations) + where + citationListPart :: PandocMonad m => OrgParser m (F Inlines) + citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do + notFollowedBy' citeKey + notFollowedBy (oneOf ";]") + inline + +berkeleyBareTag :: PandocMonad m => OrgParser m () +berkeleyBareTag = try $ void berkeleyBareTag' + +berkeleyParensTag :: PandocMonad m => OrgParser m () +berkeleyParensTag = try . void $ enclosedByPair '(' ')' berkeleyBareTag' + +berkeleyBareTag' :: PandocMonad m => OrgParser m () +berkeleyBareTag' = try $ void (string "cite") + +berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) +berkeleyTextualCite = try $ do + (suppressAuthor, key) <- citeKey + returnF . return $ Citation + { citationId = key + , citationPrefix = mempty + , citationSuffix = mempty + , citationMode = if suppressAuthor then SuppressAuthor else AuthorInText + , citationNoteNum = 0 + , citationHash = 0 + } + +-- The following is what a Berkeley-style bracketed textual citation parser +-- would look like. However, as these citations are a subset of Pandoc's Org +-- citation style, this isn't used. +-- berkeleyBracketedTextualCite :: PandocMonad m => OrgParser m (F [Citation]) +-- berkeleyBracketedTextualCite = try . (fmap head) $ +-- enclosedByPair '[' ']' berkeleyTextualCite + +-- | Read a link-like org-ref style citation. The citation includes pre and +-- post text. However, multiple citations are not possible due to limitations +-- in the syntax. +linkLikeOrgRefCite :: PandocMonad m => OrgParser m (F Citation) +linkLikeOrgRefCite = try $ do + _ <- string "[[" + mode <- orgRefCiteMode + key <- orgRefCiteKey + _ <- string "][" + pre <- trimInlinesF . mconcat <$> manyTill inline (try $ string "::") + spc <- option False (True <$ spaceChar) + suf <- trimInlinesF . mconcat <$> manyTill inline (try $ string "]]") + return $ do + pre' <- pre + suf' <- suf + return Citation + { citationId = key + , citationPrefix = B.toList pre' + , citationSuffix = B.toList (if spc then B.space <> suf' else suf') + , citationMode = mode + , citationNoteNum = 0 + , citationHash = 0 + } + +-- | Read a citation key. The characters allowed in citation keys are taken +-- from the `org-ref-cite-re` variable in `org-ref.el`. +orgRefCiteKey :: PandocMonad m => OrgParser m String +orgRefCiteKey = try . many1 . satisfy $ \c -> + isAlphaNum c || c `elem` ("-_:\\./"::String) + +-- | Supported citation types. Only a small subset of org-ref types is +-- supported for now. TODO: rewrite this, use LaTeX reader as template. +orgRefCiteMode :: PandocMonad m => OrgParser m CitationMode +orgRefCiteMode = + choice $ map (\(s, mode) -> mode <$ try (string s <* char ':')) + [ ("cite", AuthorInText) + , ("citep", NormalCitation) + , ("citep*", NormalCitation) + , ("citet", AuthorInText) + , ("citet*", AuthorInText) + , ("citeyear", SuppressAuthor) + ] + +citeList :: PandocMonad m => OrgParser m (F [Citation]) +citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) + +citation :: PandocMonad m => OrgParser m (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey + suff <- suffix + return $ do + x <- pref + y <- suff + return $ Citation{ citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + +footnote :: PandocMonad m => OrgParser m (F Inlines) +footnote = try $ inlineNote <|> referencedNote + +inlineNote :: PandocMonad m => OrgParser m (F Inlines) +inlineNote = try $ do + string "[fn:" + ref <- many alphaNum + char ':' + note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']') + when (not $ null ref) $ + addToNotesTable ("fn:" ++ ref, note) + return $ B.note <$> note + +referencedNote :: PandocMonad m => OrgParser m (F Inlines) +referencedNote = try $ do + ref <- noteMarker + return $ do + notes <- asksF orgStateNotes' + case lookup ref notes of + Nothing -> return $ B.str $ "[" ++ ref ++ "]" + Just contents -> do + st <- askF + let contents' = runF contents st{ orgStateNotes' = [] } + return $ B.note contents' + +linkOrImage :: PandocMonad m => OrgParser m (F Inlines) +linkOrImage = explicitOrImageLink + <|> selflinkOrImage + <|> angleLink + <|> plainLink + <?> "link or image" + +explicitOrImageLink :: PandocMonad m => OrgParser m (F Inlines) +explicitOrImageLink = try $ do + char '[' + srcF <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget + title <- enclosedRaw (char '[') (char ']') + title' <- parseFromString (mconcat <$> many inline) title + char ']' + return $ do + src <- srcF + case cleanLinkString title of + Just imgSrc | isImageFilename imgSrc -> + pure $ B.link src "" $ B.image imgSrc mempty mempty + _ -> + linkToInlinesF src =<< title' + +selflinkOrImage :: PandocMonad m => OrgParser m (F Inlines) +selflinkOrImage = try $ do + src <- char '[' *> linkTarget <* char ']' + return $ linkToInlinesF src (B.str src) + +plainLink :: PandocMonad m => OrgParser m (F Inlines) +plainLink = try $ do + (orig, src) <- uri + returnF $ B.link src "" (B.str orig) + +angleLink :: PandocMonad m => OrgParser m (F Inlines) +angleLink = try $ do + char '<' + link <- plainLink + char '>' + return link + +linkTarget :: PandocMonad m => OrgParser m String +linkTarget = enclosedByPair '[' ']' (noneOf "\n\r[]") + +possiblyEmptyLinkTarget :: PandocMonad m => OrgParser m String +possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]") + +applyCustomLinkFormat :: String -> OrgParser m (F String) +applyCustomLinkFormat link = do + let (linkType, rest) = break (== ':') link + return $ do + formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters + return $ maybe link ($ drop 1 rest) formatter + +-- | Take a link and return a function which produces new inlines when given +-- description inlines. +linkToInlinesF :: String -> Inlines -> F Inlines +linkToInlinesF linkStr = + case linkStr of + "" -> pure . B.link mempty "" -- wiki link (empty by convention) + ('#':_) -> pure . B.link linkStr "" -- document-local fraction + _ -> case cleanLinkString linkStr of + (Just cleanedLink) -> if isImageFilename cleanedLink + then const . pure $ B.image cleanedLink "" "" + else pure . B.link cleanedLink "" + Nothing -> internalLink linkStr -- other internal link + +internalLink :: String -> Inlines -> F Inlines +internalLink link title = do + anchorB <- (link `elem`) <$> asksF orgStateAnchorIds + if anchorB + then return $ B.link ('#':link) "" title + else return $ B.emph title + +-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with +-- @anchor-id@ set as id. Legal anchors in org-mode are defined through +-- @org-target-regexp@, which is fairly liberal. Since no link is created if +-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as +-- an anchor. + +anchor :: PandocMonad m => OrgParser m (F Inlines) +anchor = try $ do + anchorId <- parseAnchor + recordAnchorId anchorId + returnF $ B.spanWith (solidify anchorId, [], []) mempty + where + parseAnchor = string "<<" + *> many1 (noneOf "\t\n\r<>\"' ") + <* string ">>" + <* skipSpaces + +-- | Replace every char but [a-zA-Z0-9_.-:] with a hypen '-'. This mirrors +-- the org function @org-export-solidify-link-text@. + +solidify :: String -> String +solidify = map replaceSpecialChar + where replaceSpecialChar c + | isAlphaNum c = c + | c `elem` ("_.-:" :: String) = c + | otherwise = '-' + +-- | Parses an inline code block and marks it as an babel block. +inlineCodeBlock :: PandocMonad m => OrgParser m (F Inlines) +inlineCodeBlock = try $ do + string "src_" + lang <- many1 orgArgWordChar + opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption + inlineCode <- enclosedByPair '{' '}' (noneOf "\n\r") + let attrClasses = [translateLang lang, rundocBlockClass] + let attrKeyVal = map toRundocAttrib (("language", lang) : opts) + returnF $ B.codeWith ("", attrClasses, attrKeyVal) inlineCode + where + inlineBlockOption :: PandocMonad m => OrgParser m (String, String) + inlineBlockOption = try $ do + argKey <- orgArgKey + paramValue <- option "yes" orgInlineParamValue + return (argKey, paramValue) + + orgInlineParamValue :: PandocMonad m => OrgParser m String + orgInlineParamValue = try $ + skipSpaces + *> notFollowedBy (char ':') + *> many1 (noneOf "\t\n\r ]") + <* skipSpaces + + +emphasizedText :: PandocMonad m => OrgParser m (F Inlines) +emphasizedText = do + state <- getState + guard . exportEmphasizedText . orgStateExportSettings $ state + try $ choice + [ emph + , strong + , strikeout + , underline + ] + +enclosedByPair :: PandocMonad m + => Char -- ^ opening char + -> Char -- ^ closing char + -> OrgParser m a -- ^ parser + -> OrgParser m [a] +enclosedByPair s e p = char s *> many1Till p (char e) + +emph :: PandocMonad m => OrgParser m (F Inlines) +emph = fmap B.emph <$> emphasisBetween '/' + +strong :: PandocMonad m => OrgParser m (F Inlines) +strong = fmap B.strong <$> emphasisBetween '*' + +strikeout :: PandocMonad m => OrgParser m (F Inlines) +strikeout = fmap B.strikeout <$> emphasisBetween '+' + +-- There is no underline, so we use strong instead. +underline :: PandocMonad m => OrgParser m (F Inlines) +underline = fmap B.strong <$> emphasisBetween '_' + +verbatim :: PandocMonad m => OrgParser m (F Inlines) +verbatim = return . B.code <$> verbatimBetween '=' + +code :: PandocMonad m => OrgParser m (F Inlines) +code = return . B.code <$> verbatimBetween '~' + +subscript :: PandocMonad m => OrgParser m (F Inlines) +subscript = fmap B.subscript <$> try (char '_' *> subOrSuperExpr) + +superscript :: PandocMonad m => OrgParser m (F Inlines) +superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr) + +math :: PandocMonad m => OrgParser m (F Inlines) +math = return . B.math <$> choice [ math1CharBetween '$' + , mathStringBetween '$' + , rawMathBetween "\\(" "\\)" + ] + +displayMath :: PandocMonad m => OrgParser m (F Inlines) +displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]" + , rawMathBetween "$$" "$$" + ] + +updatePositions :: PandocMonad m + => Char + -> OrgParser m Char +updatePositions c = do + when (c `elem` emphasisPreChars) updateLastPreCharPos + when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos + return c + +symbol :: PandocMonad m => OrgParser m (F Inlines) +symbol = return . B.str . (: "") <$> (oneOf specialChars >>= updatePositions) + +emphasisBetween :: PandocMonad m + => Char + -> OrgParser m (F Inlines) +emphasisBetween c = try $ do + startEmphasisNewlinesCounting emphasisAllowedNewlines + res <- enclosedInlines (emphasisStart c) (emphasisEnd c) + isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState + when isTopLevelEmphasis + resetEmphasisNewlines + return res + +verbatimBetween :: PandocMonad m + => Char + -> OrgParser m String +verbatimBetween c = try $ + emphasisStart c *> + many1TillNOrLessNewlines 1 verbatimChar (emphasisEnd c) + where + verbatimChar = noneOf "\n\r" >>= updatePositions + +-- | Parses a raw string delimited by @c@ using Org's math rules +mathStringBetween :: PandocMonad m + => Char + -> OrgParser m String +mathStringBetween c = try $ do + mathStart c + body <- many1TillNOrLessNewlines mathAllowedNewlines + (noneOf (c:"\n\r")) + (lookAhead $ mathEnd c) + final <- mathEnd c + return $ body ++ [final] + +-- | Parse a single character between @c@ using math rules +math1CharBetween :: PandocMonad m + => Char + -> OrgParser m String +math1CharBetween c = try $ do + char c + res <- noneOf $ c:mathForbiddenBorderChars + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return [res] + +rawMathBetween :: PandocMonad m + => String + -> String + -> OrgParser m String +rawMathBetween s e = try $ string s *> manyTill anyChar (try $ string e) + +-- | Parses the start (opening character) of emphasis +emphasisStart :: PandocMonad m => Char -> OrgParser m Char +emphasisStart c = try $ do + guard =<< afterEmphasisPreChar + guard =<< notAfterString + char c + lookAhead (noneOf emphasisForbiddenBorderChars) + pushToInlineCharStack c + -- nested inlines are allowed, so mark this position as one which might be + -- followed by another inline. + updateLastPreCharPos + return c + +-- | Parses the closing character of emphasis +emphasisEnd :: PandocMonad m => Char -> OrgParser m Char +emphasisEnd c = try $ do + guard =<< notAfterForbiddenBorderChar + char c + eof <|> () <$ lookAhead acceptablePostChars + updateLastStrPos + popInlineCharStack + return c + where acceptablePostChars = + surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars) + +mathStart :: PandocMonad m => Char -> OrgParser m Char +mathStart c = try $ + char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars)) + +mathEnd :: PandocMonad m => Char -> OrgParser m Char +mathEnd c = try $ do + res <- noneOf (c:mathForbiddenBorderChars) + char c + eof <|> () <$ lookAhead (oneOf mathPostChars) + return res + + +enclosedInlines :: PandocMonad m => OrgParser m a + -> OrgParser m b + -> OrgParser m (F Inlines) +enclosedInlines start end = try $ + trimInlinesF . mconcat <$> enclosed start end inline + +enclosedRaw :: PandocMonad m => OrgParser m a + -> OrgParser m b + -> OrgParser m String +enclosedRaw start end = try $ + start *> (onSingleLine <|> spanningTwoLines) + where onSingleLine = try $ many1Till (noneOf "\n\r") end + spanningTwoLines = try $ + anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine + +-- | Like many1Till, but parses at most @n+1@ lines. @p@ must not consume +-- newlines. +many1TillNOrLessNewlines :: PandocMonad m => Int + -> OrgParser m Char + -> OrgParser m a + -> OrgParser m String +many1TillNOrLessNewlines n p end = try $ + nMoreLines (Just n) mempty >>= oneOrMore + where + nMoreLines Nothing cs = return cs + nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine + nMoreLines k cs = try $ (final k cs <|> rest k cs) + >>= uncurry nMoreLines + final _ cs = (\x -> (Nothing, cs ++ x)) <$> try finalLine + rest m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline) + finalLine = try $ manyTill p end + minus1 k = k - 1 + oneOrMore cs = guard (not $ null cs) *> return cs + +-- Org allows customization of the way it reads emphasis. We use the defaults +-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components` +-- for details). + +-- | Chars allowed to occur before emphasis (spaces and newlines are ok, too) +emphasisPreChars :: [Char] +emphasisPreChars = "\t \"'({" + +-- | Chars allowed at after emphasis +emphasisPostChars :: [Char] +emphasisPostChars = "\t\n !\"'),-.:;?\\}" + +-- | Chars not allowed at the (inner) border of emphasis +emphasisForbiddenBorderChars :: [Char] +emphasisForbiddenBorderChars = "\t\n\r \"'," + +-- | The maximum number of newlines within +emphasisAllowedNewlines :: Int +emphasisAllowedNewlines = 1 + +-- LaTeX-style math: see `org-latex-regexps` for details + +-- | Chars allowed after an inline ($...$) math statement +mathPostChars :: [Char] +mathPostChars = "\t\n \"'),-.:;?" + +-- | Chars not allowed at the (inner) border of math +mathForbiddenBorderChars :: [Char] +mathForbiddenBorderChars = "\t\n\r ,;.$" + +-- | Maximum number of newlines in an inline math statement +mathAllowedNewlines :: Int +mathAllowedNewlines = 2 + +-- | Whether we are right behind a char allowed before emphasis +afterEmphasisPreChar :: PandocMonad m => OrgParser m Bool +afterEmphasisPreChar = do + pos <- getPosition + lastPrePos <- orgStateLastPreCharPos <$> getState + return . fromMaybe True $ (== pos) <$> lastPrePos + +-- | Whether the parser is right after a forbidden border char +notAfterForbiddenBorderChar :: PandocMonad m => OrgParser m Bool +notAfterForbiddenBorderChar = do + pos <- getPosition + lastFBCPos <- orgStateLastForbiddenCharPos <$> getState + return $ lastFBCPos /= Just pos + +-- | Read a sub- or superscript expression +subOrSuperExpr :: PandocMonad m => OrgParser m (F Inlines) +subOrSuperExpr = try $ + choice [ id <$> charsInBalanced '{' '}' (noneOf "\n\r") + , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r") + , simpleSubOrSuperString + ] >>= parseFromString (mconcat <$> many inline) + where enclosing (left, right) s = left : s ++ [right] + +simpleSubOrSuperString :: PandocMonad m => OrgParser m String +simpleSubOrSuperString = try $ do + state <- getState + guard . exportSubSuperscripts . orgStateExportSettings $ state + choice [ string "*" + , mappend <$> option [] ((:[]) <$> oneOf "+-") + <*> many1 alphaNum + ] + +inlineLaTeX :: PandocMonad m => OrgParser m (F Inlines) +inlineLaTeX = try $ do + cmd <- inlineLaTeXCommand + ils <- (lift . lift) $ parseAsInlineLaTeX cmd + maybe mzero returnF $ + parseAsMath cmd `mplus` parseAsMathMLSym cmd `mplus` ils + where + parseAsMath :: String -> Maybe Inlines + parseAsMath cs = B.fromList <$> texMathToPandoc cs + + parseAsInlineLaTeX :: PandocMonad m => String -> m (Maybe Inlines) + parseAsInlineLaTeX cs = maybeRight <$> runParserT inlineCommand state "" cs + + parseAsMathMLSym :: String -> Maybe Inlines + parseAsMathMLSym cs = B.str <$> MathMLEntityMap.getUnicode (clean cs) + -- drop initial backslash and any trailing "{}" + where clean = dropWhileEnd (`elem` ("{}" :: String)) . drop 1 + + state :: ParserState + state = def{ stateOptions = def{ readerExtensions = + enableExtension Ext_raw_tex (readerExtensions def) } } + + texMathToPandoc :: String -> Maybe [Inline] + texMathToPandoc cs = (maybeRight $ readTeX cs) >>= writePandoc DisplayInline + +maybeRight :: Either a b -> Maybe b +maybeRight = either (const Nothing) Just + +inlineLaTeXCommand :: PandocMonad m => OrgParser m String +inlineLaTeXCommand = try $ do + rest <- getInput + parsed <- (lift . lift) $ runParserT rawLaTeXInline def "source" rest + case parsed of + Right (RawInline _ cs) -> do + -- drop any trailing whitespace, those are not be part of the command as + -- far as org mode is concerned. + let cmdNoSpc = dropWhileEnd isSpace cs + let len = length cmdNoSpc + count len anyChar + return cmdNoSpc + _ -> mzero + +-- Taken from Data.OldList. +dropWhileEnd :: (a -> Bool) -> [a] -> [a] +dropWhileEnd p = foldr (\x xs -> if p x && null xs then [] else x : xs) [] + +exportSnippet :: PandocMonad m => OrgParser m (F Inlines) +exportSnippet = try $ do + string "@@" + format <- many1Till (alphaNum <|> char '-') (char ':') + snippet <- manyTill anyChar (try $ string "@@") + returnF $ B.rawInline format snippet + +smart :: PandocMonad m => OrgParser m (F Inlines) +smart = do + guardEnabled Ext_smart + doubleQuoted <|> singleQuoted <|> + choice (map (return <$>) [orgApostrophe, orgDash, orgEllipses]) + where + orgDash = do + guard =<< getExportSetting exportSpecialStrings + dash <* updatePositions '-' + orgEllipses = do + guard =<< getExportSetting exportSpecialStrings + ellipses <* updatePositions '.' + orgApostrophe = + (char '\'' <|> char '\8217') <* updateLastPreCharPos + <* updateLastForbiddenCharPos + *> return (B.str "\x2019") + +singleQuoted :: PandocMonad m => OrgParser m (F Inlines) +singleQuoted = try $ do + guard =<< getExportSetting exportSmartQuotes + singleQuoteStart + updatePositions '\'' + withQuoteContext InSingleQuote $ + fmap B.singleQuoted . trimInlinesF . mconcat <$> + many1Till inline (singleQuoteEnd <* updatePositions '\'') + +-- doubleQuoted will handle regular double-quoted sections, as well +-- as dialogues with an open double-quote without a close double-quote +-- in the same paragraph. +doubleQuoted :: PandocMonad m => OrgParser m (F Inlines) +doubleQuoted = try $ do + guard =<< getExportSetting exportSmartQuotes + doubleQuoteStart + updatePositions '"' + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ (doubleQuoteEnd <* updateLastForbiddenCharPos) >> return + (fmap B.doubleQuoted . trimInlinesF $ contents)) + <|> (return $ return (B.str "\8220") <> contents) diff --git a/src/Text/Pandoc/Readers/Org/Meta.hs b/src/Text/Pandoc/Readers/Org/Meta.hs new file mode 100644 index 000000000..2f4e21248 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Meta.hs @@ -0,0 +1,218 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE TupleSections #-} +{- +Copyright (C) 2014-2017 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Meta + Copyright : Copyright (C) 2014-2017 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Parsers for Org-mode meta declarations. +-} +module Text.Pandoc.Readers.Org.Meta + ( metaExport + , metaKey + , metaLine + ) where + +import Text.Pandoc.Readers.Org.BlockStarts +import Text.Pandoc.Readers.Org.ExportSettings ( exportSettings ) +import Text.Pandoc.Readers.Org.Inlines +import Text.Pandoc.Readers.Org.ParserState +import Text.Pandoc.Readers.Org.Parsing + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Blocks, Inlines ) +import Text.Pandoc.Class ( PandocMonad ) +import Text.Pandoc.Definition + +import Control.Monad ( mzero, void ) +import Data.Char ( toLower ) +import Data.List ( intersperse ) +import qualified Data.Map as M +import Data.Monoid ( (<>) ) +import Network.HTTP ( urlEncode ) + +-- | Returns the current meta, respecting export options. +metaExport :: Monad m => OrgParser m (F Meta) +metaExport = do + st <- getState + let settings = orgStateExportSettings st + return $ (if exportWithAuthor settings then id else removeMeta "author") + . (if exportWithCreator settings then id else removeMeta "creator") + . (if exportWithEmail settings then id else removeMeta "email") + <$> orgStateMeta st + +removeMeta :: String -> Meta -> Meta +removeMeta key meta' = + let metaMap = unMeta meta' + in Meta $ M.delete key metaMap + +-- | Parse and handle a single line containing meta information +-- The order, in which blocks are tried, makes sure that we're not looking at +-- the beginning of a block, so we don't need to check for it +metaLine :: PandocMonad m => OrgParser m Blocks +metaLine = mempty <$ metaLineStart <* (optionLine <|> declarationLine) + +declarationLine :: PandocMonad m => OrgParser m () +declarationLine = try $ do + key <- map toLower <$> metaKey + (key', value) <- metaValue key + updateState $ \st -> + let meta' = B.setMeta key' <$> value <*> pure nullMeta + in st { orgStateMeta = meta' <> orgStateMeta st } + +metaKey :: Monad m => OrgParser m String +metaKey = map toLower <$> many1 (noneOf ": \n\r") + <* char ':' + <* skipSpaces + +metaValue :: PandocMonad m => String -> OrgParser m (String, (F MetaValue)) +metaValue key = + let inclKey = "header-includes" + in case key of + "author" -> (key,) <$> metaInlinesCommaSeparated + "title" -> (key,) <$> metaInlines + "date" -> (key,) <$> metaInlines + "header-includes" -> (key,) <$> accumulatingList key metaInlines + "latex_header" -> (inclKey,) <$> + accumulatingList inclKey (metaExportSnippet "latex") + "latex_class" -> ("documentclass",) <$> metaString + -- Org-mode expects class options to contain the surrounding brackets, + -- pandoc does not. + "latex_class_options" -> ("classoption",) <$> + metaModifiedString (filter (`notElem` "[]")) + "html_head" -> (inclKey,) <$> + accumulatingList inclKey (metaExportSnippet "html") + _ -> (key,) <$> metaString + +metaInlines :: PandocMonad m => OrgParser m (F MetaValue) +metaInlines = fmap (MetaInlines . B.toList) <$> inlinesTillNewline + +metaInlinesCommaSeparated :: PandocMonad m => OrgParser m (F MetaValue) +metaInlinesCommaSeparated = do + authStrs <- (many1 (noneOf ",\n")) `sepBy1` (char ',') + newline + authors <- mapM (parseFromString inlinesTillNewline . (++ "\n")) authStrs + let toMetaInlines = MetaInlines . B.toList + return $ MetaList . map toMetaInlines <$> sequence authors + +metaString :: Monad m => OrgParser m (F MetaValue) +metaString = metaModifiedString id + +metaModifiedString :: Monad m => (String -> String) -> OrgParser m (F MetaValue) +metaModifiedString f = return . MetaString . f <$> anyLine + +-- | Read an format specific meta definition +metaExportSnippet :: Monad m => String -> OrgParser m (F MetaValue) +metaExportSnippet format = + return . MetaInlines . B.toList . B.rawInline format <$> anyLine + +-- | Accumulate the result of the @parser@ in a list under @key@. +accumulatingList :: Monad m => String + -> OrgParser m (F MetaValue) + -> OrgParser m (F MetaValue) +accumulatingList key p = do + value <- p + meta' <- orgStateMeta <$> getState + return $ (\m v -> MetaList (curList m ++ [v])) <$> meta' <*> value + where curList m = case lookupMeta key m of + Just (MetaList ms) -> ms + Just x -> [x] + _ -> [] + +-- +-- export options +-- +optionLine :: Monad m => OrgParser m () +optionLine = try $ do + key <- metaKey + case key of + "link" -> parseLinkFormat >>= uncurry addLinkFormat + "options" -> exportSettings + "todo" -> todoSequence >>= updateState . registerTodoSequence + "seq_todo" -> todoSequence >>= updateState . registerTodoSequence + "typ_todo" -> todoSequence >>= updateState . registerTodoSequence + _ -> mzero + +addLinkFormat :: Monad m => String + -> (String -> String) + -> OrgParser m () +addLinkFormat key formatter = updateState $ \s -> + let fs = orgStateLinkFormatters s + in s{ orgStateLinkFormatters = M.insert key formatter fs } + +parseLinkFormat :: Monad m => OrgParser m ((String, String -> String)) +parseLinkFormat = try $ do + linkType <- (:) <$> letter <*> many (alphaNum <|> oneOf "-_") <* skipSpaces + linkSubst <- parseFormat + return (linkType, linkSubst) + +-- | An ad-hoc, single-argument-only implementation of a printf-style format +-- parser. +parseFormat :: Monad m => OrgParser m (String -> String) +parseFormat = try $ do + replacePlain <|> replaceUrl <|> justAppend + where + -- inefficient, but who cares + replacePlain = try $ (\x -> concat . flip intersperse x) + <$> sequence [tillSpecifier 's', rest] + replaceUrl = try $ (\x -> concat . flip intersperse x . urlEncode) + <$> sequence [tillSpecifier 'h', rest] + justAppend = try $ (++) <$> rest + + rest = manyTill anyChar (eof <|> () <$ oneOf "\n\r") + tillSpecifier c = manyTill (noneOf "\n\r") (try $ string ('%':c:"")) + +inlinesTillNewline :: PandocMonad m => OrgParser m (F Inlines) +inlinesTillNewline = trimInlinesF . mconcat <$> manyTill inline newline + +-- +-- ToDo Sequences and Keywords +-- +todoSequence :: Monad m => OrgParser m TodoSequence +todoSequence = try $ do + todoKws <- todoKeywords + doneKws <- optionMaybe $ todoDoneSep *> todoKeywords + newline + -- There must be at least one DONE keyword. The last TODO keyword is taken if + -- necessary. + case doneKws of + Just done -> return $ keywordsToSequence todoKws done + Nothing -> case reverse todoKws of + [] -> mzero -- no keywords present + (x:xs) -> return $ keywordsToSequence (reverse xs) [x] + + where + todoKeywords :: Monad m => OrgParser m [String] + todoKeywords = try $ + let keyword = many1 nonspaceChar <* skipSpaces + endOfKeywords = todoDoneSep <|> void newline + in manyTill keyword (lookAhead endOfKeywords) + + todoDoneSep :: Monad m => OrgParser m () + todoDoneSep = void . try $ skipSpaces *> char '|' <* skipSpaces1 + + keywordsToSequence :: [String] -> [String] -> TodoSequence + keywordsToSequence todo done = + let todoMarkers = map (TodoMarker Todo) todo + doneMarkers = map (TodoMarker Done) done + in todoMarkers ++ doneMarkers diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs new file mode 100644 index 000000000..181dd1d5c --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -0,0 +1,259 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Define the Org-mode parser state. +-} +module Text.Pandoc.Readers.Org.ParserState + ( OrgParserState (..) + , OrgParserLocal (..) + , OrgNoteRecord + , HasReaderOptions (..) + , HasQuoteContext (..) + , TodoMarker (..) + , TodoSequence + , TodoState (..) + , activeTodoMarkers + , registerTodoSequence + , F(..) + , askF + , asksF + , trimInlinesF + , runF + , returnF + , ExportSettings (..) + , ArchivedTreesOption (..) + , optionsToParserState + ) where + +import Control.Monad (liftM, liftM2) +import Control.Monad.Reader (Reader, runReader, ReaderT, ask, asks, local) + +import Data.Default (Default(..)) +import qualified Data.Map as M +import qualified Data.Set as Set + +import Text.Pandoc.Builder ( Inlines, Blocks, trimInlines ) +import Text.Pandoc.Definition ( Meta(..), nullMeta ) +import Text.Pandoc.Options ( ReaderOptions(..) ) +import Text.Pandoc.Parsing ( HasHeaderMap(..) + , HasIdentifierList(..) + , HasLastStrPosition(..) + , HasQuoteContext(..) + , HasReaderOptions(..) + , ParserContext(..) + , QuoteContext(..) + , SourcePos ) + +-- | An inline note / footnote containing the note key and its (inline) value. +type OrgNoteRecord = (String, F Blocks) +-- | Table of footnotes +type OrgNoteTable = [OrgNoteRecord] +-- | Map of functions for link transformations. The map key is refers to the +-- link-type, the corresponding function transforms the given link string. +type OrgLinkFormatters = M.Map String (String -> String) + +-- | The states in which a todo item can be +data TodoState = Todo | Done + deriving (Eq, Ord, Show) + +-- | A ToDo keyword like @TODO@ or @DONE@. +data TodoMarker = TodoMarker + { todoMarkerState :: TodoState + , todoMarkerName :: String + } + deriving (Show, Eq) + +-- | Collection of todo markers in the order in which items should progress +type TodoSequence = [TodoMarker] + +-- | Org-mode parser state +data OrgParserState = OrgParserState + { orgStateAnchorIds :: [String] + , orgStateEmphasisCharStack :: [Char] + , orgStateEmphasisNewlines :: Maybe Int + , orgStateExportSettings :: ExportSettings + , orgStateHeaderMap :: M.Map Inlines String + , orgStateIdentifiers :: Set.Set String + , orgStateLastForbiddenCharPos :: Maybe SourcePos + , orgStateLastPreCharPos :: Maybe SourcePos + , orgStateLastStrPos :: Maybe SourcePos + , orgStateLinkFormatters :: OrgLinkFormatters + , orgStateMeta :: F Meta + , orgStateNotes' :: OrgNoteTable + , orgStateOptions :: ReaderOptions + , orgStateParserContext :: ParserContext + , orgStateTodoSequences :: [TodoSequence] + } + +data OrgParserLocal = OrgParserLocal { orgLocalQuoteContext :: QuoteContext } + +instance Default OrgParserLocal where + def = OrgParserLocal NoQuote + +instance HasReaderOptions OrgParserState where + extractReaderOptions = orgStateOptions + +instance HasLastStrPosition OrgParserState where + getLastStrPos = orgStateLastStrPos + setLastStrPos pos st = st{ orgStateLastStrPos = Just pos } + +instance Monad m => HasQuoteContext st (ReaderT OrgParserLocal m) where + getQuoteContext = asks orgLocalQuoteContext + withQuoteContext q = local (\s -> s{orgLocalQuoteContext = q}) + +instance HasIdentifierList OrgParserState where + extractIdentifierList = orgStateIdentifiers + updateIdentifierList f s = s{ orgStateIdentifiers = f (orgStateIdentifiers s) } + +instance HasHeaderMap OrgParserState where + extractHeaderMap = orgStateHeaderMap + updateHeaderMap f s = s{ orgStateHeaderMap = f (orgStateHeaderMap s) } + +instance Default OrgParserState where + def = defaultOrgParserState + +defaultOrgParserState :: OrgParserState +defaultOrgParserState = OrgParserState + { orgStateAnchorIds = [] + , orgStateEmphasisCharStack = [] + , orgStateEmphasisNewlines = Nothing + , orgStateExportSettings = def + , orgStateHeaderMap = M.empty + , orgStateIdentifiers = Set.empty + , orgStateLastForbiddenCharPos = Nothing + , orgStateLastPreCharPos = Nothing + , orgStateLastStrPos = Nothing + , orgStateLinkFormatters = M.empty + , orgStateMeta = return nullMeta + , orgStateNotes' = [] + , orgStateOptions = def + , orgStateParserContext = NullState + , orgStateTodoSequences = [] + } + +optionsToParserState :: ReaderOptions -> OrgParserState +optionsToParserState opts = + def { orgStateOptions = opts } + +registerTodoSequence :: TodoSequence -> OrgParserState -> OrgParserState +registerTodoSequence todoSeq st = + let curSeqs = orgStateTodoSequences st + in st{ orgStateTodoSequences = todoSeq : curSeqs } + +-- | Get the current todo/done sequences. If no custom todo sequences have been +-- defined, return a list containing just the default todo/done sequence. +activeTodoSequences :: OrgParserState -> [TodoSequence] +activeTodoSequences st = + let curSeqs = orgStateTodoSequences st + in if null curSeqs + then [[ TodoMarker Todo "TODO" , TodoMarker Done "DONE" ]] + else curSeqs + +activeTodoMarkers :: OrgParserState -> TodoSequence +activeTodoMarkers = concat . activeTodoSequences + + +-- +-- Export Settings +-- + +-- | Options for the way archived trees are handled. +data ArchivedTreesOption = + ArchivedTreesExport -- ^ Export the complete tree + | ArchivedTreesNoExport -- ^ Exclude archived trees from exporting + | ArchivedTreesHeadlineOnly -- ^ Export only the headline, discard the contents + +-- | Export settings <http://orgmode.org/manual/Export-settings.html> +-- These settings can be changed via OPTIONS statements. +data ExportSettings = ExportSettings + { exportArchivedTrees :: ArchivedTreesOption -- ^ How to treat archived trees + , exportDrawers :: Either [String] [String] + -- ^ Specify drawer names which should be exported. @Left@ names are + -- explicitly excluded from the resulting output while @Right@ means that + -- only the listed drawer names should be included. + , exportEmphasizedText :: Bool -- ^ Parse emphasized text + , exportHeadlineLevels :: Int + -- ^ Maximum depth of headlines, deeper headlines are convert to list + , exportSmartQuotes :: Bool -- ^ Parse quotes smartly + , exportSpecialStrings :: Bool -- ^ Parse ellipses and dashes smartly + , exportSubSuperscripts :: Bool -- ^ TeX-like syntax for sub- and superscripts + , exportWithAuthor :: Bool -- ^ Include author in final meta-data + , exportWithCreator :: Bool -- ^ Include creator in final meta-data + , exportWithEmail :: Bool -- ^ Include email in final meta-data + , exportWithTodoKeywords :: Bool -- ^ Keep TODO keywords in headers + } + +instance Default ExportSettings where + def = defaultExportSettings + +defaultExportSettings :: ExportSettings +defaultExportSettings = ExportSettings + { exportArchivedTrees = ArchivedTreesHeadlineOnly + , exportDrawers = Left ["LOGBOOK"] + , exportEmphasizedText = True + , exportHeadlineLevels = 3 + , exportSmartQuotes = True + , exportSpecialStrings = True + , exportSubSuperscripts = True + , exportWithAuthor = True + , exportWithCreator = True + , exportWithEmail = True + , exportWithTodoKeywords = True + } + + +-- +-- Parser state reader +-- + +-- | Reader monad wrapping the parser state. This is used to delay evaluation +-- until all relevant information has been parsed and made available in the +-- parser state. See also the newtype of the same name in +-- Text.Pandoc.Parsing. +newtype F a = F { unF :: Reader OrgParserState a + } deriving (Functor, Applicative, Monad) + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = fmap mconcat . sequence + +runF :: F a -> OrgParserState -> a +runF = runReader . unF + +askF :: F OrgParserState +askF = F ask + +asksF :: (OrgParserState -> a) -> F a +asksF f = F $ asks f + +trimInlinesF :: F Inlines -> F Inlines +trimInlinesF = liftM trimInlines + +returnF :: Monad m => a -> m (F a) +returnF = return . return diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs new file mode 100644 index 000000000..1eb8a3b00 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -0,0 +1,217 @@ +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Org-mode parsing utilities. + +Most functions are simply re-exports from @Text.Pandoc.Parsing@, some +functions are adapted to Org-mode specific functionality. +-} +module Text.Pandoc.Readers.Org.Parsing + ( OrgParser + , anyLine + , blanklines + , newline + , parseFromString + , skipSpaces1 + , inList + , withContext + , getExportSetting + , updateLastForbiddenCharPos + , updateLastPreCharPos + , orgArgKey + , orgArgWord + , orgArgWordChar + -- * Re-exports from Text.Pandoc.Parser + , ParserContext (..) + , many1Till + , notFollowedBy' + , spaceChar + , nonspaceChar + , skipSpaces + , blankline + , enclosed + , stringAnyCase + , charsInBalanced + , uri + , withRaw + , readWithM + , guardEnabled + , updateLastStrPos + , notAfterString + , ParserState (..) + , registerHeader + , QuoteContext (..) + , singleQuoteStart + , singleQuoteEnd + , doubleQuoteStart + , doubleQuoteEnd + , dash + , ellipses + , citeKey + -- * Re-exports from Text.Pandoc.Parsec + , runParser + , runParserT + , getInput + , char + , letter + , digit + , alphaNum + , skipMany1 + , spaces + , anyChar + , satisfy + , string + , count + , eof + , noneOf + , oneOf + , lookAhead + , notFollowedBy + , many + , many1 + , manyTill + , (<|>) + , (<?>) + , choice + , try + , sepBy + , sepBy1 + , sepEndBy1 + , option + , optional + , optionMaybe + , getState + , updateState + , SourcePos + , getPosition + ) where + +import Text.Pandoc.Readers.Org.ParserState + +import qualified Text.Pandoc.Parsing as P +import Text.Pandoc.Parsing hiding ( anyLine, blanklines, newline + , parseFromString ) + +import Control.Monad ( guard ) +import Control.Monad.Reader ( ReaderT ) + +-- | The parser used to read org files. +type OrgParser m = ParserT [Char] OrgParserState (ReaderT OrgParserLocal m) + +-- +-- Adaptions and specializations of parsing utilities +-- + +-- | Parse any line of text +anyLine :: Monad m => OrgParser m String +anyLine = + P.anyLine + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- The version Text.Pandoc.Parsing cannot be used, as we need additional parts +-- of the state saved and restored. +parseFromString :: Monad m => OrgParser m a -> String -> OrgParser m a +parseFromString parser str' = do + oldLastPreCharPos <- orgStateLastPreCharPos <$> getState + updateState $ \s -> s{ orgStateLastPreCharPos = Nothing } + result <- P.parseFromString parser str' + updateState $ \s -> s{ orgStateLastPreCharPos = oldLastPreCharPos } + return result + +-- | Skip one or more tab or space characters. +skipSpaces1 :: Monad m => OrgParser m () +skipSpaces1 = skipMany1 spaceChar + +-- | Like @Text.Parsec.Char.newline@, but causes additional state changes. +newline :: Monad m => OrgParser m Char +newline = + P.newline + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- | Like @Text.Parsec.Char.blanklines@, but causes additional state changes. +blanklines :: Monad m => OrgParser m [Char] +blanklines = + P.blanklines + <* updateLastPreCharPos + <* updateLastForbiddenCharPos + +-- | Succeeds when we're in list context. +inList :: Monad m => OrgParser m () +inList = do + ctx <- orgStateParserContext <$> getState + guard (ctx == ListItemState) + +-- | Parse in different context +withContext :: Monad m + => ParserContext -- ^ New parser context + -> OrgParser m a -- ^ Parser to run in that context + -> OrgParser m a +withContext context parser = do + oldContext <- orgStateParserContext <$> getState + updateState $ \s -> s{ orgStateParserContext = context } + result <- parser + updateState $ \s -> s{ orgStateParserContext = oldContext } + return result + +-- +-- Parser state functions +-- + +-- | Get an export setting. +getExportSetting :: Monad m => (ExportSettings -> a) -> OrgParser m a +getExportSetting s = s . orgStateExportSettings <$> getState + +-- | Set the current position as the last position at which a forbidden char +-- was found (i.e. a character which is not allowed at the inner border of +-- markup). +updateLastForbiddenCharPos :: Monad m => OrgParser m () +updateLastForbiddenCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastForbiddenCharPos = Just p} + +-- | Set the current parser position as the position at which a character was +-- seen which allows inline markup to follow. +updateLastPreCharPos :: Monad m => OrgParser m () +updateLastPreCharPos = getPosition >>= \p -> + updateState $ \s -> s{ orgStateLastPreCharPos = Just p} + +-- +-- Org key-value parsing +-- + +-- | Read the key of a plist style key-value list. +orgArgKey :: Monad m => OrgParser m String +orgArgKey = try $ + skipSpaces *> char ':' + *> many1 orgArgWordChar + +-- | Read the value of a plist style key-value list. +orgArgWord :: Monad m => OrgParser m String +orgArgWord = many1 orgArgWordChar + +-- | Chars treated as part of a word in plists. +orgArgWordChar :: Monad m => OrgParser m Char +orgArgWordChar = alphaNum <|> oneOf "-_" diff --git a/src/Text/Pandoc/Readers/Org/Shared.hs b/src/Text/Pandoc/Readers/Org/Shared.hs new file mode 100644 index 000000000..8c87cfa25 --- /dev/null +++ b/src/Text/Pandoc/Readers/Org/Shared.hs @@ -0,0 +1,97 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2014-2016 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Org.Options + Copyright : Copyright (C) 2014-2016 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + +Utility functions used in other Pandoc Org modules. +-} +module Text.Pandoc.Readers.Org.Shared + ( cleanLinkString + , isImageFilename + , rundocBlockClass + , toRundocAttrib + , translateLang + ) where + +import Control.Arrow ( first ) +import Data.Char ( isAlphaNum ) +import Data.List ( isPrefixOf, isSuffixOf ) + + +-- | Check whether the given string looks like the path to of URL of an image. +isImageFilename :: String -> Bool +isImageFilename filename = + any (\x -> ('.':x) `isSuffixOf` filename) imageExtensions && + (any (\x -> (x ++ "://") `isPrefixOf` filename) protocols || + ':' `notElem` filename) + where + imageExtensions = [ "jpeg" , "jpg" , "png" , "gif" , "svg" ] + protocols = [ "file", "http", "https" ] + +-- | Cleanup and canonicalize a string describing a link. Return @Nothing@ if +-- the string does not appear to be a link. +cleanLinkString :: String -> Maybe String +cleanLinkString s = + case s of + '/':_ -> Just $ "file://" ++ s -- absolute path + '.':'/':_ -> Just s -- relative path + '.':'.':'/':_ -> Just s -- relative path + -- Relative path or URL (file schema) + 'f':'i':'l':'e':':':s' -> Just $ if ("//" `isPrefixOf` s') then s else s' + _ | isUrl s -> Just s -- URL + _ -> Nothing + where + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) + +-- | Prefix used for Rundoc classes and arguments. +rundocPrefix :: String +rundocPrefix = "rundoc-" + +-- | The class-name used to mark rundoc blocks. +rundocBlockClass :: String +rundocBlockClass = rundocPrefix ++ "block" + +-- | Prefix the name of a attribute, marking it as a code execution parameter. +toRundocAttrib :: (String, String) -> (String, String) +toRundocAttrib = first (rundocPrefix ++) + +-- | Translate from Org-mode's programming language identifiers to those used +-- by Pandoc. This is useful to allow for proper syntax highlighting in +-- Pandoc output. +translateLang :: String -> String +translateLang cs = + case cs of + "C" -> "c" + "C++" -> "cpp" + "emacs-lisp" -> "commonlisp" -- emacs lisp is not supported + "js" -> "javascript" + "lisp" -> "commonlisp" + "R" -> "r" + "sh" -> "bash" + "sqlite" -> "sql" + _ -> cs diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..441c573d9 --- /dev/null +++ b/src/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,1354 @@ +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.RST + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion from reStructuredText to 'Pandoc' document. +-} +module Text.Pandoc.Readers.RST ( readRST ) where +import Text.Pandoc.Definition +import Text.Pandoc.Builder (setMeta, fromList) +import Text.Pandoc.Shared +import Text.Pandoc.Parsing +import Text.Pandoc.Options +import Text.Pandoc.Logging +import Text.Pandoc.Error +import Control.Monad ( when, liftM, guard, mzero ) +import Data.List ( findIndex, intercalate, isInfixOf, + transpose, sort, deleteFirstsBy, isSuffixOf , nub, union) +import Data.Maybe (fromMaybe, isJust) +import qualified Data.Map as M +import Text.Printf ( printf ) +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import qualified Text.Pandoc.Builder as B +import Data.Sequence (viewr, ViewR(..)) +import Data.Char (toLower, isHexDigit, isSpace, toUpper) +import Data.Monoid ((<>)) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad, readFileFromDirs) + +-- TODO: +-- [ ] .. parsed-literal +-- [ ] :widths: attribute in .. table +-- [ ] .. csv-table +-- [ ] .. list-table + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readRST opts s = do + parsed <- (readWithM parseRST) def{ stateOptions = opts } (s ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e + +type RSTParser m = ParserT [Char] ParserState m + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers :: [Char] +bulletListMarkers = "*+-" + +underlineChars :: [Char] +underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\\`|*_<>$:/[]{}()-.\"'\8216\8217\8220\8221" + +-- +-- parsing documents +-- + +isHeader :: Int -> Block -> Bool +isHeader n (Header x _ _) = x == n +isHeader _ _ = False + +-- | Promote all headers in a list of blocks. (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level attr text):rest) = + (Header (level - num) attr text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders _ [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers. Also process a definition list right +-- after the title block as metadata. +titleTransform :: ([Block], Meta) -- ^ list of blocks, metadata + -> ([Block], Meta) -- ^ modified list of blocks, metadata +titleTransform (bs, meta) = + let (bs', meta') = + case bs of + ((Header 1 _ head1):(Header 2 _ head2):rest) + | not (any (isHeader 1) rest || any (isHeader 2) rest) -> -- tit/sub + (promoteHeaders 2 rest, setMeta "title" (fromList head1) $ + setMeta "subtitle" (fromList head2) meta) + ((Header 1 _ head1):rest) + | not (any (isHeader 1) rest) -> -- title only + (promoteHeaders 1 rest, + setMeta "title" (fromList head1) meta) + _ -> (bs, meta) + in case bs' of + (DefinitionList ds : rest) -> + (rest, metaFromDefList ds meta') + _ -> (bs', meta') + +metaFromDefList :: [([Inline], [[Block]])] -> Meta -> Meta +metaFromDefList ds meta = adjustAuthors $ foldr f meta ds + where f (k,v) = setMeta (map toLower $ stringify k) (mconcat $ map fromList v) + adjustAuthors (Meta metamap) = Meta $ M.adjust splitAuthors "author" + $ M.adjust toPlain "date" + $ M.adjust toPlain "title" + $ M.mapKeys (\k -> if k == "authors" then "author" else k) + $ metamap + toPlain (MetaBlocks [Para xs]) = MetaInlines xs + toPlain x = x + splitAuthors (MetaBlocks [Para xs]) + = MetaList $ map MetaInlines + $ splitAuthors' xs + splitAuthors x = x + splitAuthors' = map normalizeSpaces . + splitOnSemi . concatMap factorSemi + splitOnSemi = splitBy (==Str ";") + factorSemi (Str []) = [] + factorSemi (Str s) = case break (==';') s of + (xs,[]) -> [Str xs] + (xs,';':ys) -> Str xs : Str ";" : + factorSemi (Str ys) + (xs,ys) -> Str xs : + factorSemi (Str ys) + factorSemi x = [x] + +parseRST :: PandocMonad m => RSTParser m Pandoc +parseRST = do + optional blanklines -- skip blank lines at beginning of file + startPos <- getPosition + -- go through once just to get list of reference keys and notes + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- concat <$> + manyTill (referenceKey <|> noteBlock <|> lineClump) eof + setInput docMinusKeys + setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + -- now parse it for real... + blocks <- B.toList <$> parseBlocks + standalone <- getOption readerStandalone + state <- getState + let meta = stateMeta state + let (blocks', meta') = if standalone + then titleTransform (blocks, meta) + else (blocks, meta) + reportLogMessages + return $ Pandoc meta' blocks' + +-- +-- parsing blocks +-- + +parseBlocks :: PandocMonad m => RSTParser m Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: PandocMonad m => RSTParser m Blocks +block = choice [ codeBlock + , blockQuote + , fieldList + , include + , directive + , comment + , header + , hrule + , lineBlock -- must go before definitionList + , table + , list + , lhsCodeBlock + , para + , mempty <$ blanklines + ] <?> "block" + +-- +-- field list +-- + +rawFieldListItem :: Monad m => Int -> RSTParser m (String, String) +rawFieldListItem minIndent = try $ do + indent <- length <$> many (char ' ') + guard $ indent >= minIndent + char ':' + name <- many1Till (noneOf "\n") (char ':') + (() <$ lookAhead newline) <|> skipMany1 spaceChar + first <- anyLine + rest <- option "" $ try $ do lookAhead (count indent (char ' ') >> spaceChar) + indentedBlock + let raw = (if null first then "" else (first ++ "\n")) ++ rest ++ "\n" + return (name, raw) + +fieldListItem :: PandocMonad m => Int -> RSTParser m (Inlines, [Blocks]) +fieldListItem minIndent = try $ do + (name, raw) <- rawFieldListItem minIndent + term <- parseInlineFromString name + contents <- parseFromString parseBlocks raw + optional blanklines + return (term, [contents]) + +fieldList :: PandocMonad m => RSTParser m Blocks +fieldList = try $ do + indent <- length <$> lookAhead (many spaceChar) + items <- many1 $ fieldListItem indent + case items of + [] -> return mempty + items' -> return $ B.definitionList items' + +-- +-- line block +-- + +lineBlock :: PandocMonad m => RSTParser m Blocks +lineBlock = try $ do + lines' <- lineBlockLines + lines'' <- mapM parseInlineFromString lines' + return $ B.lineBlock lines'' + +lineBlockDirective :: PandocMonad m => String -> RSTParser m Blocks +lineBlockDirective body = do + lines' <- mapM parseInlineFromString $ lines $ stripTrailingNewlines body + return $ B.lineBlock lines' + +-- +-- paragraph block +-- + +-- note: paragraph can end in a :: starting a code block +para :: PandocMonad m => RSTParser m Blocks +para = try $ do + result <- trimInlines . mconcat <$> many1 inline + option (B.plain result) $ try $ do + newline + blanklines + case viewr (B.unMany result) of + ys :> (Str xs) | "::" `isSuffixOf` xs -> do + raw <- option mempty codeBlockBody + return $ B.para (B.Many ys <> B.str (take (length xs - 1) xs)) + <> raw + _ -> return (B.para result) + +plain :: PandocMonad m => RSTParser m Blocks +plain = B.plain . trimInlines . mconcat <$> many1 inline + +-- +-- header blocks +-- + +header :: PandocMonad m => RSTParser m Blocks +header = doubleHeader <|> singleHeader <?> "header" + +-- a header with lines on top and bottom +doubleHeader :: PandocMonad m => RSTParser m Blocks +doubleHeader = try $ do + c <- oneOf underlineChars + rest <- many (char c) -- the top line + let lenTop = length (c:rest) + skipSpaces + newline + txt <- trimInlines . mconcat <$> many1 (notFollowedBy blankline >> inline) + pos <- getPosition + let len = (sourceColumn pos) - 1 + if (len > lenTop) then fail "title longer than border" else return () + blankline -- spaces and newline + count lenTop (char c) -- the bottom line + blanklines + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt + +-- a header with line on the bottom only +singleHeader :: PandocMonad m => RSTParser m Blocks +singleHeader = try $ do + notFollowedBy' whitespace + txt <- trimInlines . mconcat <$> many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + blankline + c <- oneOf underlineChars + count (len - 1) (char c) + many (char c) + blanklines + state <- getState + let headerTable = stateHeaderTable state + let (headerTable',level) = case findIndex (== SingleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + attr <- registerHeader nullAttr txt + return $ B.headerWith attr level txt + +-- +-- hrule block +-- + +hrule :: Monad m => ParserT [Char] st m Blocks +hrule = try $ do + chr <- oneOf underlineChars + count 3 (char chr) + skipMany (char chr) + blankline + blanklines + return B.horizontalRule + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine :: Monad m => String -> ParserT [Char] st m [Char] +indentedLine indents = try $ do + string indents + anyLine + +-- one or more indented lines, possibly separated by blank lines. +-- any amount of indentation will work. +indentedBlock :: Monad m => ParserT [Char] st m [Char] +indentedBlock = try $ do + indents <- lookAhead $ many1 spaceChar + lns <- many1 $ try $ do b <- option "" blanklines + l <- indentedLine indents + return (b ++ l) + optional blanklines + return $ unlines lns + +quotedBlock :: Monad m => ParserT [Char] st m [Char] +quotedBlock = try $ do + quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + lns <- many1 $ lookAhead (char quote) >> anyLine + optional blanklines + return $ unlines lns + +codeBlockStart :: Monad m => ParserT [Char] st m Char +codeBlockStart = string "::" >> blankline >> blankline + +codeBlock :: Monad m => ParserT [Char] st m Blocks +codeBlock = try $ codeBlockStart >> codeBlockBody + +codeBlockBody :: Monad m => ParserT [Char] st m Blocks +codeBlockBody = try $ B.codeBlock . stripTrailingNewlines <$> + (indentedBlock <|> quotedBlock) + +lhsCodeBlock :: Monad m => RSTParser m Blocks +lhsCodeBlock = try $ do + getPosition >>= guard . (==1) . sourceColumn + guardEnabled Ext_literate_haskell + optional codeBlockStart + lns <- latexCodeBlock <|> birdCodeBlock + blanklines + return $ B.codeBlockWith ("", ["sourceCode", "literate", "haskell"], []) + $ intercalate "\n" lns + +latexCodeBlock :: Monad m => ParserT [Char] st m [[Char]] +latexCodeBlock = try $ do + try (latexBlockLine "\\begin{code}") + many1Till anyLine (try $ latexBlockLine "\\end{code}") + where + latexBlockLine s = skipMany spaceChar >> string s >> blankline + +birdCodeBlock :: Monad m => ParserT [Char] st m [[Char]] +birdCodeBlock = filterSpace <$> many1 birdTrackLine + where filterSpace lns = + -- if (as is normal) there is always a space after >, drop it + if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns + +birdTrackLine :: Monad m => ParserT [Char] st m [Char] +birdTrackLine = char '>' >> anyLine + +-- +-- block quotes +-- + +blockQuote :: PandocMonad m => RSTParser m Blocks +blockQuote = do + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n\n" + return $ B.blockQuote contents + +{- +Unsupported options for include: +tab-width +encoding +-} + +include :: PandocMonad m => RSTParser m Blocks +include = try $ do + string ".. include::" + skipMany spaceChar + f <- trim <$> anyLine + fields <- many $ rawFieldListItem 3 + -- options + let (startLine :: Maybe Int) = lookup "start-line" fields >>= safeRead + let (endLine :: Maybe Int) = lookup "end-line" fields >>= safeRead + guard $ not (null f) + oldPos <- getPosition + oldInput <- getInput + containers <- stateContainers <$> getState + when (f `elem` containers) $ + throwError $ PandocParseError $ "Include file loop at " ++ show oldPos + updateState $ \s -> s{ stateContainers = f : stateContainers s } + mbContents <- readFileFromDirs ["."] f + contentLines <- case mbContents of + Just s -> return $ lines s + Nothing -> do + logMessage $ CouldNotLoadIncludeFile f oldPos + return [] + let numLines = length contentLines + let startLine' = case startLine of + Nothing -> 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + let endLine' = case endLine of + Nothing -> numLines + 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + let contentLines' = drop (startLine' - 1) + $ take (endLine' - 1) + $ contentLines + let contentLines'' = (case trim <$> lookup "end-before" fields of + Just patt -> takeWhile (not . (patt `isInfixOf`)) + Nothing -> id) . + (case trim <$> lookup "start-after" fields of + Just patt -> drop 1 . + dropWhile (not . (patt `isInfixOf`)) + Nothing -> id) $ contentLines' + let contents' = unlines contentLines'' + case lookup "code" fields of + Just lang -> do + let numberLines = lookup "number-lines" fields + let classes = trimr lang : ["numberLines" | isJust numberLines] ++ + maybe [] words (lookup "class" fields) + let kvs = maybe [] (\n -> [("startFrom", trimr n)]) numberLines + let ident = maybe "" trimr $ lookup "name" fields + let attribs = (ident, classes, kvs) + return $ B.codeBlockWith attribs contents' + Nothing -> case lookup "literal" fields of + Just _ -> return $ B.rawBlock "rst" contents' + Nothing -> do + setPosition $ newPos f 1 1 + setInput contents' + bs <- optional blanklines >> + (mconcat <$> many block) + setInput oldInput + setPosition oldPos + updateState $ \s -> s{ stateContainers = + tail $ stateContainers s } + return bs + + +-- +-- list blocks +-- + +list :: PandocMonad m => RSTParser m Blocks +list = choice [ bulletList, orderedList, definitionList ] <?> "list" + +definitionListItem :: PandocMonad m => RSTParser m (Inlines, [Blocks]) +definitionListItem = try $ do + -- avoid capturing a directive or comment + notFollowedBy (try $ char '.' >> char '.') + term <- trimInlines . mconcat <$> many1Till inline endline + raw <- indentedBlock + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ raw ++ "\n" + return (term, [contents]) + +definitionList :: PandocMonad m => RSTParser m Blocks +definitionList = B.definitionList <$> many1 definitionListItem + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart :: Monad m => ParserT [Char] st m Int +bulletListStart = try $ do + notFollowedBy' hrule -- because hrules start out just like lists + marker <- oneOf bulletListMarkers + white <- many1 spaceChar + return $ length (marker:white) + +-- parses ordered list start and returns its length (inc following whitespace) +orderedListStart :: Monad m => ListNumberStyle + -> ListNumberDelim + -> RSTParser m Int +orderedListStart style delim = try $ do + (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim) + white <- many1 spaceChar + return $ markerLen + length white + +-- parse a line of a list item +listLine :: Monad m => Int -> RSTParser m [Char] +listLine markerLength = try $ do + notFollowedBy blankline + indentWith markerLength + line <- anyLine + return $ line ++ "\n" + +-- indent by specified number of spaces (or equiv. tabs) +indentWith :: Monad m => Int -> RSTParser m [Char] +indentWith num = do + tabStop <- getOption readerTabStop + if (num < tabStop) + then count num (char ' ') + else choice [ try (count num (char ' ')), + (try (char '\t' >> count (num - tabStop) (char ' '))) ] + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: Monad m => RSTParser m Int + -> RSTParser m (Int, [Char]) +rawListItem start = try $ do + markerLength <- start + firstLine <- anyLine + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines))) + +-- continuation of a list item - indented and separated by blankline or +-- (in compact lists) endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Monad m => Int -> RSTParser m [Char] +listContinuation markerLength = try $ do + blanks <- many1 blankline + result <- many1 (listLine markerLength) + return $ blanks ++ concat result + +listItem :: PandocMonad m + => RSTParser m Int + -> RSTParser m Blocks +listItem start = try $ do + (markerLength, first) <- rawListItem start + rest <- many (listContinuation markerLength) + skipMany1 blankline <|> () <$ lookAhead start + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may itself contain block elements + parsed <- parseFromString parseBlocks $ concat (first:rest) ++ "\n" + updateState (\st -> st {stateParserContext = oldContext}) + return $ case B.toList parsed of + [Para xs] -> B.singleton $ Plain xs + [Para xs, BulletList ys] -> B.fromList [Plain xs, BulletList ys] + [Para xs, OrderedList s ys] -> B.fromList [Plain xs, OrderedList s ys] + [Para xs, DefinitionList ys] -> B.fromList [Plain xs, DefinitionList ys] + _ -> parsed + +orderedList :: PandocMonad m => RSTParser m Blocks +orderedList = try $ do + (start, style, delim) <- lookAhead (anyOrderedListMarker <* spaceChar) + items <- many1 (listItem (orderedListStart style delim)) + let items' = compactify items + return $ B.orderedListWith (start, style, delim) items' + +bulletList :: PandocMonad m => RSTParser m Blocks +bulletList = B.bulletList . compactify <$> many1 (listItem bulletListStart) + +-- +-- directive (e.g. comment, container, compound-paragraph) +-- + +comment :: Monad m => RSTParser m Blocks +comment = try $ do + string ".." + skipMany1 spaceChar <|> (() <$ lookAhead newline) + notFollowedBy' directiveLabel + manyTill anyChar blanklines + optional indentedBlock + return mempty + +directiveLabel :: Monad m => RSTParser m String +directiveLabel = map toLower + <$> many1Till (letter <|> char '-') (try $ string "::") + +directive :: PandocMonad m => RSTParser m Blocks +directive = try $ do + string ".." + directive' + +directive' :: PandocMonad m => RSTParser m Blocks +directive' = do + skipMany1 spaceChar + label <- directiveLabel + skipMany spaceChar + top <- many $ satisfy (/='\n') + <|> try (char '\n' <* + notFollowedBy' (rawFieldListItem 3) <* + count 3 (char ' ') <* + notFollowedBy blankline) + newline + fields <- many $ rawFieldListItem 3 + body <- option "" $ try $ blanklines >> indentedBlock + optional blanklines + let body' = body ++ "\n\n" + imgAttr cl = ("", classes, getAtt "width" ++ getAtt "height") + where + classes = words $ maybe "" trim $ lookup cl fields + getAtt k = case lookup k fields of + Just v -> [(k, filter (not . isSpace) v)] + Nothing -> [] + case label of + "table" -> tableDirective top fields body' + "line-block" -> lineBlockDirective body' + "raw" -> return $ B.rawBlock (trim top) (stripTrailingNewlines body) + "role" -> addNewRole top $ map (\(k,v) -> (k, trim v)) fields + "container" -> parseFromString parseBlocks body' + "replace" -> B.para <$> -- consumed by substKey + parseInlineFromString (trim top) + "unicode" -> B.para <$> -- consumed by substKey + parseInlineFromString (trim $ unicodeTransform top) + "compound" -> parseFromString parseBlocks body' + "pull-quote" -> B.blockQuote <$> parseFromString parseBlocks body' + "epigraph" -> B.blockQuote <$> parseFromString parseBlocks body' + "highlights" -> B.blockQuote <$> parseFromString parseBlocks body' + "rubric" -> B.para . B.strong <$> parseInlineFromString top + _ | label `elem` ["attention","caution","danger","error","hint", + "important","note","tip","warning","admonition"] -> + do bod <- parseFromString parseBlocks $ top ++ "\n\n" ++ body' + let lab = case label of + "admonition" -> mempty + (l:ls) -> B.divWith ("",["admonition-title"],[]) + (B.para (B.str (toUpper l : ls))) + [] -> mempty + return $ B.divWith ("",[label],[]) (lab <> bod) + "sidebar" -> + do let subtit = maybe "" trim $ lookup "subtitle" fields + tit <- B.para . B.strong <$> parseInlineFromString + (trim top ++ if null subtit + then "" + else (": " ++ subtit)) + bod <- parseFromString parseBlocks body' + return $ B.divWith ("",["sidebar"],[]) $ tit <> bod + "topic" -> + do tit <- B.para . B.strong <$> parseInlineFromString top + bod <- parseFromString parseBlocks body' + return $ B.divWith ("",["topic"],[]) $ tit <> bod + "default-role" -> mempty <$ updateState (\s -> + s { stateRstDefaultRole = + case trim top of + "" -> stateRstDefaultRole def + role -> role }) + x | x == "code" || x == "code-block" -> + codeblock (words $ fromMaybe [] $ lookup "class" fields) + (lookup "number-lines" fields) (trim top) body + "aafig" -> do + let attribs = ("", ["aafig"], map (\(k,v) -> (k, trimr v)) fields) + return $ B.codeBlockWith attribs $ stripTrailingNewlines body + "math" -> return $ B.para $ mconcat $ map B.displayMath + $ toChunks $ top ++ "\n\n" ++ body + "figure" -> do + (caption, legend) <- parseFromString extractCaption body' + let src = escapeURI $ trim top + return $ B.para (B.imageWith (imgAttr "figclass") src "fig:" caption) <> legend + "image" -> do + let src = escapeURI $ trim top + let alt = B.str $ maybe "image" trim $ lookup "alt" fields + let attr = imgAttr "class" + return $ B.para + $ case lookup "target" fields of + Just t -> B.link (escapeURI $ trim t) "" + $ B.imageWith attr src "" alt + Nothing -> B.imageWith attr src "" alt + "class" -> do + let attrs = ("", (splitBy isSpace $ trim top), map (\(k,v) -> (k, trimr v)) fields) + -- directive content or the first immediately following element + children <- case body of + "" -> block + _ -> parseFromString parseBlocks body' + return $ B.divWith attrs children + other -> do + pos <- getPosition + logMessage $ SkippedContent (".. " ++ other) pos + return mempty + +tableDirective :: PandocMonad m + => String -> [(String, String)] -> String -> RSTParser m Blocks +tableDirective top _fields body = do + bs <- parseFromString parseBlocks body + case B.toList bs of + [Table _ aligns' widths' header' rows'] -> do + title <- parseFromString (trimInlines . mconcat <$> many inline) top + -- TODO widths + -- align is not applicable since we can't represent whole table align + return $ B.singleton $ Table (B.toList title) + aligns' widths' header' rows' + _ -> return mempty + +-- TODO: +-- - Only supports :format: fields with a single format for :raw: roles, +-- change Text.Pandoc.Definition.Format to fix +addNewRole :: PandocMonad m => String -> [(String, String)] -> RSTParser m Blocks +addNewRole roleString fields = do + pos <- getPosition + (role, parentRole) <- parseFromString inheritedRole roleString + customRoles <- stateRstCustomRoles <$> getState + let getBaseRole (r, f, a) roles = + case M.lookup r roles of + Just (r', f', a') -> getBaseRole (r', f', a') roles + Nothing -> (r, f, a) + (baseRole, baseFmt, baseAttr) = + getBaseRole (parentRole, Nothing, nullAttr) customRoles + fmt = if parentRole == "raw" then lookup "format" fields else baseFmt + annotate :: [String] -> [String] + annotate = maybe id (:) $ + if baseRole == "code" + then lookup "language" fields + else Nothing + attr = let (ident, classes, keyValues) = baseAttr + -- nub in case role name & language class are the same + in (ident, nub . (role :) . annotate $ classes, keyValues) + + -- warn about syntax we ignore + flip mapM_ fields $ \(key, _) -> case key of + "language" -> when (baseRole /= "code") $ logMessage $ + SkippedContent ":language: [because parent of role is not :code:]" + pos + "format" -> when (baseRole /= "raw") $ logMessage $ + SkippedContent ":format: [because parent of role is not :raw:]" pos + _ -> logMessage $ SkippedContent (":" ++ key ++ ":") pos + when (parentRole == "raw" && countKeys "format" > 1) $ + logMessage $ SkippedContent ":format: [after first in definition of role]" + pos + when (parentRole == "code" && countKeys "language" > 1) $ + logMessage $ SkippedContent + ":language: [after first in definition of role]" pos + + updateState $ \s -> s { + stateRstCustomRoles = + M.insert role (baseRole, fmt, attr) customRoles + } + + return mempty + where + countKeys k = length . filter (== k) . map fst $ fields + inheritedRole = + (,) <$> roleName <*> ((char '(' *> roleName <* char ')') <|> pure "span") + + +-- Can contain character codes as decimal numbers or +-- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u +-- or as XML-style hexadecimal character entities, e.g. ᨫ +-- or text, which is used as-is. Comments start with .. +unicodeTransform :: String -> String +unicodeTransform t = + case t of + ('.':'.':xs) -> unicodeTransform $ dropWhile (/='\n') xs -- comment + ('0':'x':xs) -> go "0x" xs + ('x':xs) -> go "x" xs + ('\\':'x':xs) -> go "\\x" xs + ('U':'+':xs) -> go "U+" xs + ('u':xs) -> go "u" xs + ('\\':'u':xs) -> go "\\u" xs + ('&':'#':'x':xs) -> maybe ("&#x" ++ unicodeTransform xs) + -- drop semicolon + (\(c,s) -> c : unicodeTransform (drop 1 s)) + $ extractUnicodeChar xs + (x:xs) -> x : unicodeTransform xs + [] -> [] + where go pref zs = maybe (pref ++ unicodeTransform zs) + (\(c,s) -> c : unicodeTransform s) + $ extractUnicodeChar zs + +extractUnicodeChar :: String -> Maybe (Char, String) +extractUnicodeChar s = maybe Nothing (\c -> Just (c,rest)) mbc + where (ds,rest) = span isHexDigit s + mbc = safeRead ('\'':'\\':'x':ds ++ "'") + +extractCaption :: PandocMonad m => RSTParser m (Inlines, Blocks) +extractCaption = do + capt <- trimInlines . mconcat <$> many inline + legend <- optional blanklines >> (mconcat <$> many block) + return (capt,legend) + +-- divide string by blanklines +toChunks :: String -> [String] +toChunks = dropWhile null + . map (trim . unlines) + . splitBy (all (`elem` (" \t" :: String))) . lines + +codeblock :: [String] -> Maybe String -> String -> String -> RSTParser m Blocks +codeblock classes numberLines lang body = + return $ B.codeBlockWith attribs $ stripTrailingNewlines body + where attribs = ("", classes', kvs) + classes' = "sourceCode" : lang + : maybe [] (\_ -> ["numberLines"]) numberLines + ++ classes + kvs = case numberLines of + Just "" -> [] + Nothing -> [] + Just n -> [("startFrom",trim n)] + +--- +--- note block +--- + +noteBlock :: Monad m => RSTParser m [Char] +noteBlock = try $ do + startPos <- getPosition + string ".." + spaceChar >> skipMany spaceChar + ref <- noteMarker + first <- (spaceChar >> skipMany spaceChar >> anyLine) + <|> (newline >> return "") + blanks <- option "" blanklines + rest <- option "" indentedBlock + endPos <- getPosition + let raw = first ++ "\n" ++ blanks ++ rest ++ "\n" + let newnote = (ref, raw) + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +noteMarker :: Monad m => RSTParser m [Char] +noteMarker = do + char '[' + res <- many1 digit + <|> (try $ char '#' >> liftM ('#':) simpleReferenceName') + <|> count 1 (oneOf "#*") + char ']' + return res + +-- +-- reference key +-- + +quotedReferenceName :: PandocMonad m => RSTParser m Inlines +quotedReferenceName = try $ do + char '`' >> notFollowedBy (char '`') -- `` means inline code! + label' <- trimInlines . mconcat <$> many1Till inline (char '`') + return label' + +unquotedReferenceName :: PandocMonad m => RSTParser m Inlines +unquotedReferenceName = try $ do + label' <- trimInlines . mconcat <$> many1Till inline (lookAhead $ char ':') + return label' + +-- Simple reference names are single words consisting of alphanumerics +-- plus isolated (no two adjacent) internal hyphens, underscores, +-- periods, colons and plus signs; no whitespace or other characters +-- are allowed. +simpleReferenceName' :: Monad m => ParserT [Char] st m String +simpleReferenceName' = do + x <- alphaNum + xs <- many $ alphaNum + <|> (try $ oneOf "-_:+." <* lookAhead alphaNum) + return (x:xs) + +simpleReferenceName :: Monad m => ParserT [Char] st m Inlines +simpleReferenceName = do + raw <- simpleReferenceName' + return $ B.str raw + +referenceName :: PandocMonad m => RSTParser m Inlines +referenceName = quotedReferenceName <|> + (try $ simpleReferenceName <* lookAhead (char ':')) <|> + unquotedReferenceName + +referenceKey :: PandocMonad m => RSTParser m [Char] +referenceKey = do + startPos <- getPosition + choice [substKey, anonymousKey, regularKey] + optional blanklines + endPos <- getPosition + -- return enough blanks to replace key + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +targetURI :: Monad m => ParserT [Char] st m [Char] +targetURI = do + skipSpaces + optional newline + contents <- many1 (try (many spaceChar >> newline >> + many1 spaceChar >> noneOf " \t\n") <|> noneOf "\n") + blanklines + return $ escapeURI $ trim $ contents + +substKey :: PandocMonad m => RSTParser m () +substKey = try $ do + string ".." + skipMany1 spaceChar + (alt,ref) <- withRaw $ trimInlines . mconcat + <$> enclosed (char '|') (char '|') inline + res <- B.toList <$> directive' + il <- case res of + -- use alt unless :alt: attribute on image: + [Para [Image attr [Str "image"] (src,tit)]] -> + return $ B.imageWith attr src tit alt + [Para [Link _ [Image attr [Str "image"] (src,tit)] (src',tit')]] -> + return $ B.link src' tit' (B.imageWith attr src tit alt) + [Para ils] -> return $ B.fromList ils + _ -> mzero + let key = toKey $ stripFirstAndLast ref + updateState $ \s -> s{ stateSubstitutions = M.insert key il $ stateSubstitutions s } + +anonymousKey :: Monad m => RSTParser m () +anonymousKey = try $ do + oneOfStrings [".. __:", "__"] + src <- targetURI + pos <- getPosition + let key = toKey $ "_" ++ printf "%09d" (sourceLine pos) + --TODO: parse width, height, class and name attributes + updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } + +stripTicks :: String -> String +stripTicks = reverse . stripTick . reverse . stripTick + where stripTick ('`':xs) = xs + stripTick xs = xs + +regularKey :: PandocMonad m => RSTParser m () +regularKey = try $ do + string ".. _" + (_,ref) <- withRaw referenceName + char ':' + src <- targetURI + let key = toKey $ stripTicks ref + --TODO: parse width, height, class and name attributes + updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } + +-- +-- tables +-- + +-- General tables TODO: +-- - figure out if leading spaces are acceptable and if so, add +-- support for them +-- +-- Simple tables TODO: +-- - column spans +-- - multiline support +-- - ensure that rightmost column span does not need to reach end +-- - require at least 2 columns +-- +-- Grid tables TODO: +-- - column spans + +dashedLine :: Monad m => Char -> ParserT [Char] st m (Int, Int) +dashedLine ch = do + dashes <- many1 (char ch) + sp <- many (char ' ') + return (length dashes, length $ dashes ++ sp) + +simpleDashedLines :: Monad m => Char -> ParserT [Char] st m [(Int,Int)] +simpleDashedLines ch = try $ many1 (dashedLine ch) + +-- Parse a table row separator +simpleTableSep :: Monad m => Char -> RSTParser m Char +simpleTableSep ch = try $ simpleDashedLines ch >> newline + +-- Parse a table footer +simpleTableFooter :: Monad m => RSTParser m [Char] +simpleTableFooter = try $ simpleTableSep '=' >> blanklines + +-- Parse a raw line and split it into chunks by indices. +simpleTableRawLine :: Monad m => [Int] -> RSTParser m [String] +simpleTableRawLine indices = do + line <- many1Till anyChar newline + return (simpleTableSplitLine indices line) + +-- Parse a table row and return a list of blocks (columns). +simpleTableRow :: PandocMonad m => [Int] -> RSTParser m [Blocks] +simpleTableRow indices = do + notFollowedBy' simpleTableFooter + firstLine <- simpleTableRawLine indices + colLines <- return [] -- TODO + let cols = map unlines . transpose $ firstLine : colLines + mapM (parseFromString (mconcat <$> many plain)) cols + +simpleTableSplitLine :: [Int] -> String -> [String] +simpleTableSplitLine indices line = + map trim + $ tail $ splitByIndices (init indices) line + +simpleTableHeader :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m ([Blocks], [Alignment], [Int]) +simpleTableHeader headless = try $ do + optional blanklines + rawContent <- if headless + then return "" + else simpleTableSep '=' >> anyLine + dashes <- simpleDashedLines '=' <|> simpleDashedLines '-' + newline + let lines' = map snd dashes + let indices = scanl (+) 0 lines' + let aligns = replicate (length lines') AlignDefault + let rawHeads = if headless + then replicate (length dashes) "" + else simpleTableSplitLine indices rawContent + heads <- mapM (parseFromString (mconcat <$> many plain)) $ + map trim rawHeads + return (heads, aligns, indices) + +-- Parse a simple table. +simpleTable :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m Blocks +simpleTable headless = do + tbl <- tableWith (simpleTableHeader headless) simpleTableRow + sep simpleTableFooter + -- Simple tables get 0s for relative column widths (i.e., use default) + case B.toList tbl of + [Table c a _w h l] -> return $ B.singleton $ + Table c a (replicate (length a) 0) h l + _ -> + throwError $ PandocShouldNeverHappenError + "tableWith returned something unexpected" + where + sep = return () -- optional (simpleTableSep '-') + +gridTable :: PandocMonad m + => Bool -- ^ Headerless table + -> RSTParser m Blocks +gridTable headerless = gridTableWith parseBlocks headerless + +table :: PandocMonad m => RSTParser m Blocks +table = gridTable False <|> simpleTable False <|> + gridTable True <|> simpleTable True <?> "table" + +-- +-- inline +-- + +inline :: PandocMonad m => RSTParser m Inlines +inline = choice [ note -- can start with whitespace, so try before ws + , whitespace + , link + , str + , endline + , strong + , emph + , code + , subst + , interpretedRole + , smart + , hyphens + , escapedChar + , symbol ] <?> "inline" + +parseInlineFromString :: PandocMonad m => String -> RSTParser m Inlines +parseInlineFromString = parseFromString (trimInlines . mconcat <$> many inline) + +hyphens :: Monad m => RSTParser m Inlines +hyphens = do + result <- many1 (char '-') + optional endline + -- don't want to treat endline after hyphen or dash as a space + return $ B.str result + +escapedChar :: Monad m => ParserT [Char] st m Inlines +escapedChar = do c <- escaped anyChar + return $ if c == ' ' -- '\ ' is null in RST + then mempty + else B.str [c] + +symbol :: Monad m => RSTParser m Inlines +symbol = do + result <- oneOf specialChars + return $ B.str [result] + +-- parses inline code, between codeStart and codeEnd +code :: Monad m => RSTParser m Inlines +code = try $ do + string "``" + result <- manyTill anyChar (try (string "``")) + return $ B.code + $ trim $ unwords $ lines result + +-- succeeds only if we're not right after a str (ie. in middle of word) +atStart :: Monad m => RSTParser m a -> RSTParser m a +atStart p = do + pos <- getPosition + st <- getState + -- single quote start can't be right after str + guard $ stateLastStrPos st /= Just pos + p + +emph :: PandocMonad m => RSTParser m Inlines +emph = B.emph . trimInlines . mconcat <$> + enclosed (atStart $ char '*') (char '*') inline + +strong :: PandocMonad m => RSTParser m Inlines +strong = B.strong . trimInlines . mconcat <$> + enclosed (atStart $ string "**") (try $ string "**") inline + +-- Note, this doesn't precisely implement the complex rule in +-- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules +-- but it should be good enough for most purposes +-- +-- TODO: +-- - Classes are silently discarded in addNewRole +-- - Lacks sensible implementation for title-reference (which is the default) +-- - Allows direct use of the :raw: role, rST only allows inherited use. +interpretedRole :: PandocMonad m => RSTParser m Inlines +interpretedRole = try $ do + (role, contents) <- roleBefore <|> roleAfter + renderRole contents Nothing role nullAttr + +renderRole :: PandocMonad m => String -> Maybe String -> String -> Attr -> RSTParser m Inlines +renderRole contents fmt role attr = case role of + "sup" -> return $ B.superscript $ B.str contents + "superscript" -> return $ B.superscript $ B.str contents + "sub" -> return $ B.subscript $ B.str contents + "subscript" -> return $ B.subscript $ B.str contents + "emphasis" -> return $ B.emph $ B.str contents + "strong" -> return $ B.strong $ B.str contents + "rfc-reference" -> return $ rfcLink contents + "RFC" -> return $ rfcLink contents + "pep-reference" -> return $ pepLink contents + "PEP" -> return $ pepLink contents + "literal" -> return $ B.codeWith attr contents + "math" -> return $ B.math contents + "title-reference" -> titleRef contents + "title" -> titleRef contents + "t" -> titleRef contents + "code" -> return $ B.codeWith (addClass "sourceCode" attr) contents + "span" -> return $ B.spanWith attr $ B.str contents + "raw" -> return $ B.rawInline (fromMaybe "" fmt) contents + custom -> do + customRoles <- stateRstCustomRoles <$> getState + case M.lookup custom customRoles of + Just (newRole, newFmt, newAttr) -> + renderRole contents newFmt newRole newAttr + Nothing -> do + pos <- getPosition + logMessage $ SkippedContent (":" ++ custom ++ ":") pos + return $ B.str contents -- Undefined role + where + titleRef ref = return $ B.str ref -- FIXME: Not a sensible behaviour + rfcLink rfcNo = B.link rfcUrl ("RFC " ++ rfcNo) $ B.str ("RFC " ++ rfcNo) + where rfcUrl = "http://www.faqs.org/rfcs/rfc" ++ rfcNo ++ ".html" + pepLink pepNo = B.link pepUrl ("PEP " ++ pepNo) $ B.str ("PEP " ++ pepNo) + where padNo = replicate (4 - length pepNo) '0' ++ pepNo + pepUrl = "http://www.python.org/dev/peps/pep-" ++ padNo ++ "/" + +addClass :: String -> Attr -> Attr +addClass c (ident, classes, keyValues) = (ident, union classes [c], keyValues) + +roleName :: PandocMonad m => RSTParser m String +roleName = many1 (letter <|> char '-') + +roleMarker :: PandocMonad m => RSTParser m String +roleMarker = char ':' *> roleName <* char ':' + +roleBefore :: PandocMonad m => RSTParser m (String,String) +roleBefore = try $ do + role <- roleMarker + contents <- unmarkedInterpretedText + return (role,contents) + +roleAfter :: PandocMonad m => RSTParser m (String,String) +roleAfter = try $ do + contents <- unmarkedInterpretedText + role <- roleMarker <|> (stateRstDefaultRole <$> getState) + return (role,contents) + +unmarkedInterpretedText :: PandocMonad m => RSTParser m [Char] +unmarkedInterpretedText = enclosed (atStart $ char '`') (char '`') anyChar + +whitespace :: PandocMonad m => RSTParser m Inlines +whitespace = B.space <$ skipMany1 spaceChar <?> "whitespace" + +str :: Monad m => RSTParser m Inlines +str = do + let strChar = noneOf ("\t\n " ++ specialChars) + result <- many1 strChar + updateLastStrPos + return $ B.str result + +-- an endline character that can be treated as a space, not a structural break +endline :: Monad m => RSTParser m Inlines +endline = try $ do + newline + notFollowedBy blankline + -- parse potential list-starts at beginning of line differently in a list: + st <- getState + if (stateParserContext st) == ListItemState + then notFollowedBy (anyOrderedListMarker >> spaceChar) >> + notFollowedBy' bulletListStart + else return () + return B.softbreak + +-- +-- links +-- + +link :: PandocMonad m => RSTParser m Inlines +link = choice [explicitLink, referenceLink, autoLink] <?> "link" + +explicitLink :: PandocMonad m => RSTParser m Inlines +explicitLink = try $ do + char '`' + notFollowedBy (char '`') -- `` marks start of inline code + label' <- trimInlines . mconcat <$> + manyTill (notFollowedBy (char '`') >> inline) (char '<') + src <- trim <$> manyTill (noneOf ">\n") (char '>') + skipSpaces + string "`_" + optional $ char '_' -- anonymous form + let label'' = if label' == mempty + then B.str src + else label' + -- `link <google_>` is a reference link to _google! + ((src',tit),attr) <- case reverse src of + '_':xs -> lookupKey [] (toKey (reverse xs)) + _ -> return ((src, ""), nullAttr) + return $ B.linkWith attr (escapeURI src') tit label'' + +referenceLink :: PandocMonad m => RSTParser m Inlines +referenceLink = try $ do + (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) <* + char '_' + let isAnonKey (Key ('_':_)) = True + isAnonKey _ = False + state <- getState + let keyTable = stateKeys state + key <- option (toKey $ stripTicks ref) $ + do char '_' + let anonKeys = sort $ filter isAnonKey $ M.keys keyTable + case anonKeys of + [] -> mzero + (k:_) -> return k + ((src,tit), attr) <- lookupKey [] key + -- if anonymous link, remove key so it won't be used again + when (isAnonKey key) $ updateState $ \s -> s{ stateKeys = M.delete key keyTable } + return $ B.linkWith attr src tit label' + +-- We keep a list of oldkeys so we can detect lookup loops. +lookupKey :: PandocMonad m + => [Key] -> Key -> RSTParser m ((String, String), Attr) +lookupKey oldkeys key = do + pos <- getPosition + state <- getState + let keyTable = stateKeys state + case M.lookup key keyTable of + Nothing -> do + let Key key' = key + logMessage $ ReferenceNotFound key' pos + return (("",""),nullAttr) + -- check for keys of the form link_, which need to be resolved: + Just ((u@(_:_),""),_) | last u == '_' -> do + let rawkey = init u + let newkey = toKey rawkey + if newkey `elem` oldkeys + then do + logMessage $ CircularReference rawkey pos + return (("",""),nullAttr) + else lookupKey (key:oldkeys) newkey + Just val -> return val + +autoURI :: Monad m => RSTParser m Inlines +autoURI = do + (orig, src) <- uri + return $ B.link src "" $ B.str orig + +autoEmail :: Monad m => RSTParser m Inlines +autoEmail = do + (orig, src) <- emailAddress + return $ B.link src "" $ B.str orig + +autoLink :: PandocMonad m => RSTParser m Inlines +autoLink = autoURI <|> autoEmail + +subst :: PandocMonad m => RSTParser m Inlines +subst = try $ do + (_,ref) <- withRaw $ enclosed (char '|') (char '|') inline + state <- getState + let substTable = stateSubstitutions state + let key = toKey $ stripFirstAndLast ref + case M.lookup key substTable of + Nothing -> do + pos <- getPosition + logMessage $ ReferenceNotFound (show key) pos + return mempty + Just target -> return target + +note :: PandocMonad m => RSTParser m Inlines +note = try $ do + optional whitespace + ref <- noteMarker + char '_' + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> do + pos <- getPosition + logMessage $ ReferenceNotFound ref pos + return mempty + Just raw -> do + -- We temporarily empty the note list while parsing the note, + -- so that we don't get infinite loops with notes inside notes... + -- Note references inside other notes are allowed in reST, but + -- not yet in this implementation. + updateState $ \st -> st{ stateNotes = [] } + contents <- parseFromString parseBlocks raw + let newnotes = if (ref == "*" || ref == "#") -- auto-numbered + -- delete the note so the next auto-numbered note + -- doesn't get the same contents: + then deleteFirstsBy (==) notes [(ref,raw)] + else notes + updateState $ \st -> st{ stateNotes = newnotes } + return $ B.note contents + +smart :: PandocMonad m => RSTParser m Inlines +smart = do + guardEnabled Ext_smart + doubleQuoted <|> singleQuoted <|> + choice [apostrophe, dash, ellipses] + +singleQuoted :: PandocMonad m => RSTParser m Inlines +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + B.singleQuoted . trimInlines . mconcat <$> + many1Till inline singleQuoteEnd + +doubleQuoted :: PandocMonad m => RSTParser m Inlines +doubleQuoted = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ + B.doubleQuoted . trimInlines . mconcat <$> + many1Till inline doubleQuoteEnd diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs new file mode 100644 index 000000000..3b89f2ee9 --- /dev/null +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -0,0 +1,525 @@ +{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances, FlexibleContexts #-} +-- RelaxedPolyRec needed for inlinesBetween on GHC < 7 +{- + Copyright (C) 2014 Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.TWiki + Copyright : Copyright (C) 2014 Alexander Sulfrian + License : GNU GPL, version 2 or above + + Maintainer : Alexander Sulfrian <alexander.sulfrian@fu-berlin.de> + Stability : alpha + Portability : portable + +Conversion of twiki text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.TWiki ( readTWiki + ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Options +import Text.Pandoc.Logging +import Text.Pandoc.Parsing hiding (enclosed, macro, nested) +import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) +import Control.Monad +import Text.Pandoc.XML (fromEntities) +import Data.Maybe (fromMaybe) +import Text.HTML.TagSoup +import Data.Char (isAlphaNum) +import qualified Data.Foldable as F +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad, report) + +-- | Read twiki from an input string and return a Pandoc document. +readTWiki :: PandocMonad m + => ReaderOptions + -> String + -> m Pandoc +readTWiki opts s = do + res <- readWithM parseTWiki def{ stateOptions = opts } (s ++ "\n\n") + case res of + Left e -> throwError e + Right d -> return d + +type TWParser = ParserT [Char] ParserState + +-- +-- utility functions +-- + +tryMsg :: String -> TWParser m a -> TWParser m a +tryMsg msg p = try p <?> msg + +skip :: TWParser m a -> TWParser m () +skip parser = parser >> return () + +nested :: PandocMonad m => TWParser m a -> TWParser m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +htmlElement :: PandocMonad m => String -> TWParser m (Attr, String) +htmlElement tag = tryMsg tag $ do + (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) + content <- manyTill anyChar (endtag <|> endofinput) + return (htmlAttrToPandoc attr, trim content) + where + endtag = skip $ htmlTag (~== TagClose tag) + endofinput = lookAhead $ try $ skipMany blankline >> skipSpaces >> eof + trim = dropWhile (=='\n') . reverse . dropWhile (=='\n') . reverse + +htmlAttrToPandoc :: [Attribute String] -> Attr +htmlAttrToPandoc attrs = (ident, classes, keyvals) + where + ident = fromMaybe "" $ lookup "id" attrs + classes = maybe [] words $ lookup "class" attrs + keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + +parseHtmlContentWithAttrs :: PandocMonad m + => String -> TWParser m a -> TWParser m (Attr, [a]) +parseHtmlContentWithAttrs tag parser = do + (attr, content) <- htmlElement tag + parsedContent <- try $ parseContent content + return (attr, parsedContent) + where + parseContent = parseFromString $ nested $ manyTill parser endOfContent + endOfContent = try $ skipMany blankline >> skipSpaces >> eof + +parseHtmlContent :: PandocMonad m => String -> TWParser m a -> TWParser m [a] +parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd + +-- +-- main parser +-- + +parseTWiki :: PandocMonad m => TWParser m Pandoc +parseTWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + + +-- +-- block parsers +-- + +block :: PandocMonad m => TWParser m B.Blocks +block = do + pos <- getPosition + res <- mempty <$ skipMany1 blankline + <|> blockElements + <|> para + skipMany blankline + report $ ParsingTrace (take 60 $ show $ B.toList res) pos + return res + +blockElements :: PandocMonad m => TWParser m B.Blocks +blockElements = choice [ separator + , header + , verbatim + , literal + , list "" + , table + , blockQuote + , noautolink + ] + +separator :: PandocMonad m => TWParser m B.Blocks +separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule + +header :: PandocMonad m => TWParser m B.Blocks +header = tryMsg "header" $ do + string "---" + level <- many1 (char '+') >>= return . length + guard $ level <= 6 + classes <- option [] $ string "!!" >> return ["unnumbered"] + skipSpaces + content <- B.trimInlines . mconcat <$> manyTill inline newline + attr <- registerHeader ("", classes, []) content + return $ B.headerWith attr level $ content + +verbatim :: PandocMonad m => TWParser m B.Blocks +verbatim = (htmlElement "verbatim" <|> htmlElement "pre") + >>= return . (uncurry B.codeBlockWith) + +literal :: PandocMonad m => TWParser m B.Blocks +literal = htmlElement "literal" >>= return . rawBlock + where + format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs + rawBlock (attrs, content) = B.rawBlock (format attrs) content + +list :: PandocMonad m => String -> TWParser m B.Blocks +list prefix = choice [ bulletList prefix + , orderedList prefix + , definitionList prefix] + +definitionList :: PandocMonad m => String -> TWParser m B.Blocks +definitionList prefix = tryMsg "definitionList" $ do + indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ " + elements <- many $ parseDefinitionListItem (prefix ++ concat indent) + return $ B.definitionList elements + where + parseDefinitionListItem :: PandocMonad m + => String -> TWParser m (B.Inlines, [B.Blocks]) + parseDefinitionListItem indent = do + string (indent ++ "$ ") >> skipSpaces + term <- many1Till inline $ string ": " + line <- listItemLine indent $ string "$ " + return $ (mconcat term, [line]) + +bulletList :: PandocMonad m => String -> TWParser m B.Blocks +bulletList prefix = tryMsg "bulletList" $ + parseList prefix (char '*') (char ' ') + +orderedList :: PandocMonad m => String -> TWParser m B.Blocks +orderedList prefix = tryMsg "orderedList" $ + parseList prefix (oneOf "1iIaA") (string ". ") + +parseList :: PandocMonad m + => String -> TWParser m Char -> TWParser m a -> TWParser m B.Blocks +parseList prefix marker delim = do + (indent, style) <- lookAhead $ string prefix *> listStyle <* delim + blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim) + return $ case style of + '1' -> B.orderedListWith (1, DefaultStyle, DefaultDelim) blocks + 'i' -> B.orderedListWith (1, LowerRoman, DefaultDelim) blocks + 'I' -> B.orderedListWith (1, UpperRoman, DefaultDelim) blocks + 'a' -> B.orderedListWith (1, LowerAlpha, DefaultDelim) blocks + 'A' -> B.orderedListWith (1, UpperAlpha, DefaultDelim) blocks + _ -> B.bulletList blocks + where + listStyle = do + indent <- many1 $ string " " + style <- marker + return (concat indent, style) + +parseListItem :: (PandocMonad m, Show a) + => String -> TWParser m a -> TWParser m B.Blocks +parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker + +listItemLine :: (PandocMonad m, Show a) + => String -> TWParser m a -> TWParser m B.Blocks +listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat + where + lineContent = do + content <- anyLine + continuation <- optionMaybe listContinuation + return $ filterSpaces content ++ "\n" ++ (maybe "" (" " ++) continuation) + filterSpaces = reverse . dropWhile (== ' ') . reverse + listContinuation = notFollowedBy (string prefix >> marker) >> + string " " >> lineContent + parseContent = parseFromString $ many1 $ nestedList <|> parseInline + parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>= + return . B.plain . mconcat + nestedList = list prefix + lastNewline = try $ char '\n' <* eof + newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList + +table :: PandocMonad m => TWParser m B.Blocks +table = try $ do + tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip + rows <- many1 tableParseRow + return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead + where + buildTable caption rows (aligns, heads) + = B.table caption aligns heads rows + align rows = replicate (columCount rows) (AlignDefault, 0) + columns rows = replicate (columCount rows) mempty + columCount rows = length $ head rows + +tableParseHeader :: PandocMonad m => TWParser m ((Alignment, Double), B.Blocks) +tableParseHeader = try $ do + char '|' + leftSpaces <- many spaceChar >>= return . length + char '*' + content <- tableColumnContent (char '*' >> skipSpaces >> char '|') + char '*' + rightSpaces <- many spaceChar >>= return . length + optional tableEndOfRow + return (tableAlign leftSpaces rightSpaces, content) + where + tableAlign left right + | left >= 2 && left == right = (AlignCenter, 0) + | left > right = (AlignRight, 0) + | otherwise = (AlignLeft, 0) + +tableParseRow :: PandocMonad m => TWParser m [B.Blocks] +tableParseRow = many1Till tableParseColumn newline + +tableParseColumn :: PandocMonad m => TWParser m B.Blocks +tableParseColumn = char '|' *> skipSpaces *> + tableColumnContent (skipSpaces >> char '|') + <* skipSpaces <* optional tableEndOfRow + +tableEndOfRow :: PandocMonad m => TWParser m Char +tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|' + +tableColumnContent :: PandocMonad m => TWParser m a -> TWParser m B.Blocks +tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat + where + content = continuation <|> inline + continuation = try $ char '\\' >> newline >> return mempty + +blockQuote :: PandocMonad m => TWParser m B.Blocks +blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat + +noautolink :: PandocMonad m => TWParser m B.Blocks +noautolink = do + (_, content) <- htmlElement "noautolink" + st <- getState + setState $ st{ stateAllowLinks = False } + blocks <- try $ parseContent content + setState $ st{ stateAllowLinks = True } + return $ mconcat blocks + where + parseContent = parseFromString $ many $ block + +para :: PandocMonad m => TWParser m B.Blocks +para = many1Till inline endOfParaElement >>= return . result . mconcat + where + endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + newBlockElement = try $ blankline >> skip blockElements + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + + +-- +-- inline parsers +-- + +inline :: PandocMonad m => TWParser m B.Inlines +inline = choice [ whitespace + , br + , macro + , strong + , strongHtml + , strongAndEmph + , emph + , emphHtml + , boldCode + , smart + , link + , htmlComment + , code + , codeHtml + , nop + , autoLink + , str + , symbol + ] <?> "inline" + +whitespace :: PandocMonad m => TWParser m B.Inlines +whitespace = (lb <|> regsp) >>= return + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +br :: PandocMonad m => TWParser m B.Inlines +br = try $ string "%BR%" >> return B.linebreak + +linebreak :: PandocMonad m => TWParser m B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +between :: (Monoid c, PandocMonad m) + => TWParser m a -> TWParser m b -> (TWParser m b -> TWParser m c) + -> TWParser m c +between start end p = + mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) + +enclosed :: (Monoid b, PandocMonad m) + => TWParser m a -> (TWParser m a -> TWParser m b) -> TWParser m b +enclosed sep p = between sep (try $ sep <* endMarker) p + where + endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof + endSpace = (spaceChar <|> newline) >> return B.space + +macro :: PandocMonad m => TWParser m B.Inlines +macro = macroWithParameters <|> withoutParameters + where + withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan + emptySpan name = buildSpan name [] mempty + +macroWithParameters :: PandocMonad m => TWParser m B.Inlines +macroWithParameters = try $ do + char '%' + name <- macroName + (content, kvs) <- attributes + char '%' + return $ buildSpan name kvs $ B.str content + +buildSpan :: String -> [(String, String)] -> B.Inlines -> B.Inlines +buildSpan className kvs = B.spanWith attrs + where + attrs = ("", ["twiki-macro", className] ++ additionalClasses, kvsWithoutClasses) + additionalClasses = maybe [] words $ lookup "class" kvs + kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"] + +macroName :: PandocMonad m => TWParser m String +macroName = do + first <- letter + rest <- many $ alphaNum <|> char '_' + return (first:rest) + +attributes :: PandocMonad m => TWParser m (String, [(String, String)]) +attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>= + return . foldr (either mkContent mkKvs) ([], []) + where + spnl = skipMany (spaceChar <|> newline) + mkContent c ([], kvs) = (c, kvs) + mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) + mkKvs kv (cont, rest) = (cont, (kv : rest)) + +attribute :: PandocMonad m => TWParser m (Either String (String, String)) +attribute = withKey <|> withoutKey + where + withKey = try $ do + key <- macroName + char '=' + parseValue False >>= return . (curry Right key) + withoutKey = try $ parseValue True >>= return . Left + parseValue allowSpaces = (withQuotes <|> withoutQuotes allowSpaces) >>= return . fromEntities + withQuotes = between (char '"') (char '"') (\_ -> count 1 $ noneOf ['"']) + withoutQuotes allowSpaces + | allowSpaces == True = many1 $ noneOf "}" + | otherwise = many1 $ noneOf " }" + +nestedInlines :: (Show a, PandocMonad m) + => TWParser m a -> TWParser m B.Inlines +nestedInlines end = innerSpace <|> nestedInline + where + innerSpace = try $ whitespace <* (notFollowedBy end) + nestedInline = notFollowedBy whitespace >> nested inline + +strong :: PandocMonad m => TWParser m B.Inlines +strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong + +strongHtml :: PandocMonad m => TWParser m B.Inlines +strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline) + >>= return . B.strong . mconcat + +strongAndEmph :: PandocMonad m => TWParser m B.Inlines +strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong + +emph :: PandocMonad m => TWParser m B.Inlines +emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph + +emphHtml :: PandocMonad m => TWParser m B.Inlines +emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline) + >>= return . B.emph . mconcat + +nestedString :: (Show a, PandocMonad m) + => TWParser m a -> TWParser m String +nestedString end = innerSpace <|> (count 1 nonspaceChar) + where + innerSpace = try $ many1 spaceChar <* notFollowedBy end + +boldCode :: PandocMonad m => TWParser m B.Inlines +boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities + +htmlComment :: PandocMonad m => TWParser m B.Inlines +htmlComment = htmlTag isCommentTag >> return mempty + +code :: PandocMonad m => TWParser m B.Inlines +code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities + +codeHtml :: PandocMonad m => TWParser m B.Inlines +codeHtml = do + (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar + return $ B.codeWith attrs $ fromEntities content + +autoLink :: PandocMonad m => TWParser m B.Inlines +autoLink = try $ do + state <- getState + guard $ stateAllowLinks state + (text, url) <- parseLink + guard $ checkLink (head $ reverse url) + return $ makeLink (text, url) + where + parseLink = notFollowedBy nop >> (uri <|> emailAddress) + makeLink (text, url) = B.link url "" $ B.str text + checkLink c + | c == '/' = True + | otherwise = isAlphaNum c + +str :: PandocMonad m => TWParser m B.Inlines +str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str + +nop :: PandocMonad m => TWParser m B.Inlines +nop = try $ (skip exclamation <|> skip nopTag) >> followContent + where + exclamation = char '!' + nopTag = stringAnyCase "<nop>" + followContent = many1 nonspaceChar >>= return . B.str . fromEntities + +symbol :: PandocMonad m => TWParser m B.Inlines +symbol = count 1 nonspaceChar >>= return . B.str + +smart :: PandocMonad m => TWParser m B.Inlines +smart = do + guardEnabled Ext_smart + doubleQuoted <|> singleQuoted <|> + choice [ apostrophe + , dash + , ellipses + ] + +singleQuoted :: PandocMonad m => TWParser m B.Inlines +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ + many1Till inline singleQuoteEnd >>= + (return . B.singleQuoted . B.trimInlines . mconcat) + +doubleQuoted :: PandocMonad m => TWParser m B.Inlines +doubleQuoted = try $ do + doubleQuoteStart + contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) + (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> + return (B.doubleQuoted $ B.trimInlines contents)) + <|> (return $ (B.str "\8220") B.<> contents) + +link :: PandocMonad m => TWParser m B.Inlines +link = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + (url, title, content) <- linkText + setState $ st{ stateAllowLinks = True } + return $ B.link url title content + +linkText :: PandocMonad m => TWParser m (String, String, B.Inlines) +linkText = do + string "[[" + url <- many1Till anyChar (char ']') + content <- option [B.str url] linkContent + char ']' + return (url, "", mconcat content) + where + linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent + parseLinkContent = parseFromString $ many1 inline diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs new file mode 100644 index 000000000..6594b9ab8 --- /dev/null +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -0,0 +1,729 @@ +{- +Copyright (C) 2010-2015 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@' + and John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Textile + Copyright : Copyright (C) 2010-2015 Paul Rivier and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Paul Rivier <paul*rivier#demotera*com> + Stability : alpha + Portability : portable + +Conversion from Textile to 'Pandoc' document, based on the spec +available at http://redcloth.org/textile. + +Implemented and parsed: + - Paragraphs + - Code blocks + - Lists + - blockquote + - Inlines : strong, emph, cite, code, deleted, superscript, + subscript, links + - footnotes + - HTML-specific and CSS-specific attributes on headers + +Left to be implemented: + - dimension sign + - all caps + - continued blocks (ex bq..) + +TODO : refactor common patterns across readers : + - more ... + +-} + + +module Text.Pandoc.Readers.Textile ( readTextile) where +import Text.Pandoc.CSS +import Text.Pandoc.Definition +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Logging +import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isInlineTag ) +import Text.Pandoc.Shared (trim) +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) +import Text.HTML.TagSoup (fromAttrib, Tag(..)) +import Text.HTML.TagSoup.Match +import Data.List ( intercalate, transpose, intersperse ) +import Data.Char ( digitToInt, isUpper ) +import Control.Monad ( guard, liftM ) +import Data.Monoid ((<>)) +import Text.Pandoc.Class (PandocMonad, report) +import Control.Monad.Except (throwError) + +-- | Parse a Textile text and return a Pandoc document. +readTextile :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> m Pandoc +readTextile opts s = do + parsed <- readWithM parseTextile def{ stateOptions = opts } (s ++ "\n\n") + case parsed of + Right result -> return result + Left e -> throwError e + + +-- | Generate a Pandoc ADT from a textile document +parseTextile :: PandocMonad m => ParserT [Char] ParserState m Pandoc +parseTextile = do + many blankline + startPos <- getPosition + -- go through once just to get list of reference keys and notes + -- docMinusKeys is the raw document with blanks where the keys/notes were... + let firstPassParser = noteBlock <|> lineClump + manyTill firstPassParser eof >>= setInput . concat + setPosition startPos + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + -- now parse it for real... + blocks <- parseBlocks + return $ Pandoc nullMeta (B.toList blocks) -- FIXME + +noteMarker :: PandocMonad m => ParserT [Char] ParserState m [Char] +noteMarker = skipMany spaceChar >> string "fn" >> manyTill digit (char '.') + +noteBlock :: PandocMonad m => ParserT [Char] ParserState m [Char] +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + optional blankline + contents <- liftM unlines $ many1Till anyLine (blanklines <|> noteBlock) + endPos <- getPosition + let newnote = (ref, contents ++ "\n") + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- | Parse document blocks +parseBlocks :: PandocMonad m => ParserT [Char] ParserState m Blocks +parseBlocks = mconcat <$> manyTill block eof + +-- | Block parsers list tried in definition order +blockParsers :: PandocMonad m => [ParserT [Char] ParserState m Blocks] +blockParsers = [ codeBlock + , header + , blockQuote + , hrule + , commentBlock + , anyList + , rawHtmlBlock + , rawLaTeXBlock' + , table + , maybeExplicitBlock "p" para + , mempty <$ blanklines + ] + +-- | Any block in the order of definition of blockParsers +block :: PandocMonad m => ParserT [Char] ParserState m Blocks +block = do + res <- choice blockParsers <?> "block" + pos <- getPosition + report $ ParsingTrace (take 60 $ show $ B.toList res) pos + return res + +commentBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks +commentBlock = try $ do + string "###." + manyTill anyLine blanklines + return mempty + +codeBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks +codeBlock = codeBlockBc <|> codeBlockPre + +codeBlockBc :: PandocMonad m => ParserT [Char] ParserState m Blocks +codeBlockBc = try $ do + string "bc." + extended <- option False (True <$ char '.') + char ' ' + let starts = ["p", "table", "bq", "bc", "h1", "h2", "h3", + "h4", "h5", "h6", "pre", "###", "notextile"] + let ender = choice $ map explicitBlockStart starts + contents <- if extended + then do + f <- anyLine + rest <- many (notFollowedBy ender *> anyLine) + return (f:rest) + else manyTill anyLine blanklines + return $ B.codeBlock (trimTrailingNewlines (unlines contents)) + +trimTrailingNewlines :: String -> String +trimTrailingNewlines = reverse . dropWhile (=='\n') . reverse + +-- | Code Blocks in Textile are between <pre> and </pre> +codeBlockPre :: PandocMonad m => ParserT [Char] ParserState m Blocks +codeBlockPre = try $ do + (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) + result' <- manyTill anyChar (htmlTag (tagClose (=="pre"))) + optional blanklines + -- drop leading newline if any + let result'' = case result' of + '\n':xs -> xs + _ -> result' + -- drop trailing newline if any + let result''' = case reverse result'' of + '\n':_ -> init result'' + _ -> result'' + let classes = words $ fromAttrib "class" t + let ident = fromAttrib "id" t + let kvs = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.codeBlockWith (ident,classes,kvs) result''' + +-- | Header of the form "hN. content" with N in 1..6 +header :: PandocMonad m => ParserT [Char] ParserState m Blocks +header = try $ do + char 'h' + level <- digitToInt <$> oneOf "123456" + attr <- attributes + char '.' + lookAhead whitespace + name <- trimInlines . mconcat <$> many inline + attr' <- registerHeader attr name + return $ B.headerWith attr' level name + +-- | Blockquote of the form "bq. content" +blockQuote :: PandocMonad m => ParserT [Char] ParserState m Blocks +blockQuote = try $ do + string "bq" >> attributes >> char '.' >> whitespace + B.blockQuote <$> para + +-- Horizontal rule + +hrule :: PandocMonad m => ParserT [Char] st m Blocks +hrule = try $ do + skipSpaces + start <- oneOf "-*" + count 2 (skipSpaces >> char start) + skipMany (spaceChar <|> char start) + newline + optional blanklines + return B.horizontalRule + +-- Lists handling + +-- | Can be a bullet list or an ordered list. This implementation is +-- strict in the nesting, sublist must start at exactly "parent depth +-- plus one" +anyList :: PandocMonad m => ParserT [Char] ParserState m Blocks +anyList = try $ anyListAtDepth 1 <* blanklines + +-- | This allow one type of list to be nested into an other type, +-- provided correct nesting +anyListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks +anyListAtDepth depth = choice [ bulletListAtDepth depth, + orderedListAtDepth depth, + definitionList ] + +-- | Bullet List of given depth, depth being the number of leading '*' +bulletListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks +bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) + +-- | Bullet List Item of given depth, depth being the number of +-- leading '*' +bulletListItemAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks +bulletListItemAtDepth = genericListItemAtDepth '*' + +-- | Ordered List of given depth, depth being the number of +-- leading '#' +orderedListAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks +orderedListAtDepth depth = try $ do + items <- many1 (orderedListItemAtDepth depth) + return $ B.orderedList items + +-- | Ordered List Item of given depth, depth being the number of +-- leading '#' +orderedListItemAtDepth :: PandocMonad m => Int -> ParserT [Char] ParserState m Blocks +orderedListItemAtDepth = genericListItemAtDepth '#' + +-- | Common implementation of list items +genericListItemAtDepth :: PandocMonad m => Char -> Int -> ParserT [Char] ParserState m Blocks +genericListItemAtDepth c depth = try $ do + count depth (char c) >> attributes >> whitespace + p <- mconcat <$> many listInline + newline + sublist <- option mempty (anyListAtDepth (depth + 1)) + return $ (B.plain p) <> sublist + +-- | A definition list is a set of consecutive definition items +definitionList :: PandocMonad m => ParserT [Char] ParserState m Blocks +definitionList = try $ B.definitionList <$> many1 definitionListItem + +-- | List start character. +listStart :: PandocMonad m => ParserT [Char] ParserState m () +listStart = genericListStart '*' + <|> () <$ genericListStart '#' + <|> () <$ definitionListStart + +genericListStart :: PandocMonad m => Char -> ParserT [Char] st m () +genericListStart c = () <$ try (many1 (char c) >> whitespace) + +basicDLStart :: PandocMonad m => ParserT [Char] ParserState m () +basicDLStart = do + char '-' + whitespace + notFollowedBy newline + +definitionListStart :: PandocMonad m => ParserT [Char] ParserState m Inlines +definitionListStart = try $ do + basicDLStart + trimInlines . mconcat <$> + many1Till inline + ( try (newline *> lookAhead basicDLStart) + <|> try (lookAhead (() <$ string ":=")) + ) + +listInline :: PandocMonad m => ParserT [Char] ParserState m Inlines +listInline = try (notFollowedBy newline >> inline) + <|> try (endline <* notFollowedBy listStart) + +-- | A definition list item in textile begins with '- ', followed by +-- the term defined, then spaces and ":=". The definition follows, on +-- the same single line, or spaned on multiple line, after a line +-- break. +definitionListItem :: PandocMonad m => ParserT [Char] ParserState m (Inlines, [Blocks]) +definitionListItem = try $ do + term <- (mconcat . intersperse B.linebreak) <$> many1 definitionListStart + def' <- string ":=" *> optional whitespace *> (multilineDef <|> inlineDef) + return (term, def') + where inlineDef :: PandocMonad m => ParserT [Char] ParserState m [Blocks] + inlineDef = liftM (\d -> [B.plain d]) + $ optional whitespace >> (trimInlines . mconcat <$> many listInline) <* newline + multilineDef :: PandocMonad m => ParserT [Char] ParserState m [Blocks] + multilineDef = try $ do + optional whitespace >> newline + s <- many1Till anyChar (try (string "=:" >> newline)) + -- this ++ "\n\n" does not look very good + ds <- parseFromString parseBlocks (s ++ "\n\n") + return [ds] + +-- raw content + +-- | A raw Html Block, optionally followed by blanklines +rawHtmlBlock :: PandocMonad m => ParserT [Char] ParserState m Blocks +rawHtmlBlock = try $ do + skipMany spaceChar + (_,b) <- htmlTag isBlockTag + optional blanklines + return $ B.rawBlock "html" b + +-- | Raw block of LaTeX content +rawLaTeXBlock' :: PandocMonad m => ParserT [Char] ParserState m Blocks +rawLaTeXBlock' = do + guardEnabled Ext_raw_tex + B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) + + +-- | In textile, paragraphs are separated by blank lines. +para :: PandocMonad m => ParserT [Char] ParserState m Blocks +para = B.para . trimInlines . mconcat <$> many1 inline + +-- Tables + +toAlignment :: Char -> Alignment +toAlignment '<' = AlignLeft +toAlignment '>' = AlignRight +toAlignment '=' = AlignCenter +toAlignment _ = AlignDefault + +cellAttributes :: PandocMonad m => ParserT [Char] ParserState m (Bool, Alignment) +cellAttributes = try $ do + isHeader <- option False (True <$ char '_') + -- we just ignore colspan and rowspan markers: + optional $ try $ oneOf "/\\" >> many1 digit + -- we pay attention to alignments: + alignment <- option AlignDefault $ toAlignment <$> oneOf "<>=" + -- ignore other attributes for now: + _ <- attributes + char '.' + return (isHeader, alignment) + +-- | A table cell spans until a pipe | +tableCell :: PandocMonad m => ParserT [Char] ParserState m ((Bool, Alignment), Blocks) +tableCell = try $ do + char '|' + (isHeader, alignment) <- option (False, AlignDefault) $ cellAttributes + notFollowedBy blankline + raw <- trim <$> + many (noneOf "|\n" <|> try (char '\n' <* notFollowedBy blankline)) + content <- mconcat <$> parseFromString (many inline) raw + return ((isHeader, alignment), B.plain content) + +-- | A table row is made of many table cells +tableRow :: PandocMonad m => ParserT [Char] ParserState m [((Bool, Alignment), Blocks)] +tableRow = try $ do + -- skip optional row attributes + optional $ try $ do + _ <- attributes + char '.' + many1 spaceChar + many1 tableCell <* char '|' <* blankline + +-- | A table with an optional header. +table :: PandocMonad m => ParserT [Char] ParserState m Blocks +table = try $ do + -- ignore table attributes + caption <- option mempty $ try $ do + string "table" + _ <- attributes + char '.' + rawcapt <- trim <$> anyLine + parseFromString (mconcat <$> many inline) rawcapt + rawrows <- many1 $ (skipMany ignorableRow) >> tableRow + skipMany ignorableRow + blanklines + let (headers, rows) = case rawrows of + (toprow:rest) | any (fst . fst) toprow -> + (toprow, rest) + _ -> (mempty, rawrows) + let nbOfCols = max (length headers) (length $ head rows) + let aligns = map minimum $ transpose $ map (map (snd . fst)) (headers:rows) + return $ B.table caption + (zip aligns (replicate nbOfCols 0.0)) + (map snd headers) + (map (map snd) rows) + +-- | Ignore markers for cols, thead, tfoot. +ignorableRow :: PandocMonad m => ParserT [Char] ParserState m () +ignorableRow = try $ do + char '|' + oneOf ":^-~" + _ <- attributes + char '.' + _ <- anyLine + return () + +explicitBlockStart :: PandocMonad m => String -> ParserT [Char] ParserState m () +explicitBlockStart name = try $ do + string name + attributes + char '.' + optional whitespace + optional endline + +-- | Blocks like 'p' and 'table' do not need explicit block tag. +-- However, they can be used to set HTML/CSS attributes when needed. +maybeExplicitBlock :: PandocMonad m + => String -- ^ block tag name + -> ParserT [Char] ParserState m Blocks -- ^ implicit block + -> ParserT [Char] ParserState m Blocks +maybeExplicitBlock name blk = try $ do + optional $ explicitBlockStart name + blk + + + +---------- +-- Inlines +---------- + + +-- | Any inline element +inline :: PandocMonad m => ParserT [Char] ParserState m Inlines +inline = do + choice inlineParsers <?> "inline" + +-- | Inline parsers tried in order +inlineParsers :: PandocMonad m => [ParserT [Char] ParserState m Inlines] +inlineParsers = [ str + , whitespace + , endline + , code + , escapedInline + , inlineMarkup + , groupedInlineMarkup + , rawHtmlInline + , rawLaTeXInline' + , note + , link + , image + , mark + , (B.str . (:[])) <$> characterReference + , smartPunctuation inline + , symbol + ] + +-- | Inline markups +inlineMarkup :: PandocMonad m => ParserT [Char] ParserState m Inlines +inlineMarkup = choice [ simpleInline (string "??") (B.cite []) + , simpleInline (string "**") B.strong + , simpleInline (string "__") B.emph + , simpleInline (char '*') B.strong + , simpleInline (char '_') B.emph + , simpleInline (char '+') B.emph -- approximates underline + , simpleInline (char '-' <* notFollowedBy (char '-')) B.strikeout + , simpleInline (char '^') B.superscript + , simpleInline (char '~') B.subscript + , simpleInline (char '%') id + ] + +-- | Trademark, registered, copyright +mark :: PandocMonad m => ParserT [Char] st m Inlines +mark = try $ char '(' >> (try tm <|> try reg <|> copy) + +reg :: PandocMonad m => ParserT [Char] st m Inlines +reg = do + oneOf "Rr" + char ')' + return $ B.str "\174" + +tm :: PandocMonad m => ParserT [Char] st m Inlines +tm = do + oneOf "Tt" + oneOf "Mm" + char ')' + return $ B.str "\8482" + +copy :: PandocMonad m => ParserT [Char] st m Inlines +copy = do + oneOf "Cc" + char ')' + return $ B.str "\169" + +note :: PandocMonad m => ParserT [Char] ParserState m Inlines +note = try $ do + ref <- (char '[' *> many1 digit <* char ']') + notes <- stateNotes <$> getState + case lookup ref notes of + Nothing -> fail "note not found" + Just raw -> B.note <$> parseFromString parseBlocks raw + +-- | Special chars +markupChars :: [Char] +markupChars = "\\*#_@~-+^|%=[]&" + +-- | Break strings on following chars. Space tab and newline break for +-- inlines breaking. Open paren breaks for mark. Quote, dash and dot +-- break for smart punctuation. Punctuation breaks for regular +-- punctuation. Double quote breaks for named links. > and < break +-- for inline html. +stringBreakers :: [Char] +stringBreakers = " \t\n\r.,\"'?!;:<>«»„“”‚‘’()[]" + +wordBoundaries :: [Char] +wordBoundaries = markupChars ++ stringBreakers + +-- | Parse a hyphened sequence of words +hyphenedWords :: PandocMonad m => ParserT [Char] ParserState m String +hyphenedWords = do + x <- wordChunk + xs <- many (try $ char '-' >> wordChunk) + return $ intercalate "-" (x:xs) + +wordChunk :: PandocMonad m => ParserT [Char] ParserState m String +wordChunk = try $ do + hd <- noneOf wordBoundaries + tl <- many ( (noneOf wordBoundaries) <|> + try (notFollowedBy' note *> oneOf markupChars + <* lookAhead (noneOf wordBoundaries) ) ) + return $ hd:tl + +-- | Any string +str :: PandocMonad m => ParserT [Char] ParserState m Inlines +str = do + baseStr <- hyphenedWords + -- RedCloth compliance : if parsed word is uppercase and immediatly + -- followed by parens, parens content is unconditionally word acronym + fullStr <- option baseStr $ try $ do + guard $ all isUpper baseStr + acro <- enclosed (char '(') (char ')') anyChar' + return $ concat [baseStr, " (", acro, ")"] + updateLastStrPos + return $ B.str fullStr + +-- | Some number of space chars +whitespace :: PandocMonad m => ParserT [Char] st m Inlines +whitespace = many1 spaceChar >> return B.space <?> "whitespace" + +-- | In Textile, an isolated endline character is a line break +endline :: PandocMonad m => ParserT [Char] ParserState m Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy listStart + notFollowedBy rawHtmlBlock + return B.linebreak + +rawHtmlInline :: PandocMonad m => ParserT [Char] ParserState m Inlines +rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag + +-- | Raw LaTeX Inline +rawLaTeXInline' :: PandocMonad m => ParserT [Char] ParserState m Inlines +rawLaTeXInline' = try $ do + guardEnabled Ext_raw_tex + B.singleton <$> rawLaTeXInline + +-- | Textile standard link syntax is "label":target. But we +-- can also have ["label":target]. +link :: PandocMonad m => ParserT [Char] ParserState m Inlines +link = try $ do + bracketed <- (True <$ char '[') <|> return False + char '"' *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + name <- trimInlines . mconcat <$> + withQuoteContext InDoubleQuote (many1Till inline (char '"')) + char ':' + let stop = if bracketed + then char ']' + else lookAhead $ space <|> + try (oneOf "!.,;:" *> (space <|> newline)) + url <- many1Till nonspaceChar stop + let name' = if B.toList name == [Str "$"] then B.str url else name + return $ if attr == nullAttr + then B.link url "" name' + else B.spanWith attr $ B.link url "" name' + +-- | image embedding +image :: PandocMonad m => ParserT [Char] ParserState m Inlines +image = try $ do + char '!' >> notFollowedBy space + (ident, cls, kvs) <- attributes + let attr = case lookup "style" kvs of + Just stls -> (ident, cls, pickStylesToKVs ["width", "height"] stls) + Nothing -> (ident, cls, kvs) + src <- many1 (noneOf " \t\n\r!(") + alt <- option "" $ try $ char '(' *> manyTill anyChar (char ')') + char '!' + return $ B.imageWith attr src alt (B.str alt) + +escapedInline :: PandocMonad m => ParserT [Char] ParserState m Inlines +escapedInline = escapedEqs <|> escapedTag + +escapedEqs :: PandocMonad m => ParserT [Char] ParserState m Inlines +escapedEqs = B.str <$> + (try $ string "==" *> manyTill anyChar' (try $ string "==")) + +-- | literal text escaped btw <notextile> tags +escapedTag :: PandocMonad m => ParserT [Char] ParserState m Inlines +escapedTag = B.str <$> + (try $ string "<notextile>" *> + manyTill anyChar' (try $ string "</notextile>")) + +-- | Any special symbol defined in wordBoundaries +symbol :: PandocMonad m => ParserT [Char] ParserState m Inlines +symbol = B.str . singleton <$> (notFollowedBy newline *> + notFollowedBy rawHtmlBlock *> + oneOf wordBoundaries) + +-- | Inline code +code :: PandocMonad m => ParserT [Char] ParserState m Inlines +code = code1 <|> code2 + +-- any character except a newline before a blank line +anyChar' :: PandocMonad m => ParserT [Char] ParserState m Char +anyChar' = + satisfy (/='\n') <|> (try $ char '\n' <* notFollowedBy blankline) + +code1 :: PandocMonad m => ParserT [Char] ParserState m Inlines +code1 = B.code <$> surrounded (char '@') anyChar' + +code2 :: PandocMonad m => ParserT [Char] ParserState m Inlines +code2 = do + htmlTag (tagOpen (=="tt") null) + B.code <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) + +-- | Html / CSS attributes +attributes :: PandocMonad m => ParserT [Char] ParserState m Attr +attributes = (foldl (flip ($)) ("",[],[])) <$> + try (do special <- option id specialAttribute + attrs <- many attribute + return (special : attrs)) + +specialAttribute :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) +specialAttribute = do + alignStr <- ("center" <$ char '=') <|> + ("justify" <$ try (string "<>")) <|> + ("right" <$ char '>') <|> + ("left" <$ char '<') + notFollowedBy spaceChar + return $ addStyle ("text-align:" ++ alignStr) + +attribute :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) +attribute = try $ + (classIdAttr <|> styleAttr <|> langAttr) <* notFollowedBy spaceChar + +classIdAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) +classIdAttr = try $ do -- (class class #id) + char '(' + ws <- words `fmap` manyTill anyChar' (char ')') + case reverse ws of + [] -> return $ \(_,_,keyvals) -> ("",[],keyvals) + (('#':ident'):classes') -> return $ \(_,_,keyvals) -> + (ident',classes',keyvals) + classes' -> return $ \(_,_,keyvals) -> + ("",classes',keyvals) + +styleAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) +styleAttr = do + style <- try $ enclosed (char '{') (char '}') anyChar' + return $ addStyle style + +addStyle :: String -> Attr -> Attr +addStyle style (id',classes,keyvals) = + (id',classes,keyvals') + where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] + style' = style ++ ";" ++ concat [v | ("style",v) <- keyvals] + +langAttr :: PandocMonad m => ParserT [Char] ParserState m (Attr -> Attr) +langAttr = do + lang <- try $ enclosed (char '[') (char ']') alphaNum + return $ \(id',classes,keyvals) -> (id',classes,("lang",lang):keyvals) + +-- | Parses material surrounded by a parser. +surrounded :: PandocMonad m + => ParserT [Char] st m t -- ^ surrounding parser + -> ParserT [Char] st m a -- ^ content parser (to be used repeatedly) + -> ParserT [Char] st m [a] +surrounded border = + enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) + +simpleInline :: PandocMonad m + => ParserT [Char] ParserState m t -- ^ surrounding parser + -> (Inlines -> Inlines) -- ^ Inline constructor + -> ParserT [Char] ParserState m Inlines -- ^ content parser (to be used repeatedly) +simpleInline border construct = try $ do + notAfterString + border *> notFollowedBy (oneOf " \t\n\r") + attr <- attributes + body <- trimInlines . mconcat <$> + withQuoteContext InSingleQuote + (manyTill (notFollowedBy newline >> inline) + (try border <* notFollowedBy alphaNum)) + return $ construct $ + if attr == nullAttr + then body + else B.spanWith attr body + +groupedInlineMarkup :: PandocMonad m => ParserT [Char] ParserState m Inlines +groupedInlineMarkup = try $ do + char '[' + sp1 <- option mempty $ B.space <$ whitespace + result <- withQuoteContext InSingleQuote inlineMarkup + sp2 <- option mempty $ B.space <$ whitespace + char ']' + return $ sp1 <> result <> sp2 + +-- | Create a singleton list +singleton :: a -> [a] +singleton x = [x] + diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs new file mode 100644 index 000000000..9e2b6963d --- /dev/null +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -0,0 +1,596 @@ +{-# LANGUAGE ViewPatterns #-} +{- +Copyright (C) 2014 Matthew Pickering <matthewtpickering@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Txt2Tags + Copyright : Copyright (C) 2014 Matthew Pickering + License : GNU GPL, version 2 or above + + Maintainer : Matthew Pickering <matthewtpickering@gmail.com> + +Conversion of txt2tags formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Txt2Tags ( readTxt2Tags + , getT2TMeta + , T2TMeta (..) + ) + where + +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder ( Inlines, Blocks, trimInlines ) +import Data.Monoid ((<>)) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared (escapeURI,compactify, compactifyDL) +import Text.Pandoc.Parsing hiding (space, spaces, uri, macro) +import Data.Char (toLower) +import Data.List (transpose, intersperse, intercalate) +import Data.Maybe (fromMaybe) +--import Network.URI (isURI) -- Not sure whether to use this function +import Control.Monad (void, guard, when) +import Data.Default +import Control.Monad.Reader (Reader, runReader, asks) + +import Data.Time.Format (formatTime) +import Text.Pandoc.Compat.Time (defaultTimeLocale) +import Control.Monad.Except (throwError, catchError) +import Text.Pandoc.Class (PandocMonad) +import qualified Text.Pandoc.Class as P + +type T2T = ParserT String ParserState (Reader T2TMeta) + +-- | An object for the T2T macros meta information +-- the contents of each field is simply substituted verbatim into the file +data T2TMeta = T2TMeta { + date :: String -- ^ Current date + , mtime :: String -- ^ Last modification time of infile + , infile :: FilePath -- ^ Input file + , outfile :: FilePath -- ^ Output file + } deriving Show + +instance Default T2TMeta where + def = T2TMeta "" "" "" "" + +-- | Get the meta information required by Txt2Tags macros +getT2TMeta :: PandocMonad m => m T2TMeta +getT2TMeta = do + mbInps <- P.getInputFiles + let inps = case mbInps of + Just x -> x + Nothing -> [] + mbOutp <- P.getOutputFile + let outp = case mbOutp of + Just x -> x + Nothing -> "" + curDate <- formatTime defaultTimeLocale "%F" <$> P.getZonedTime + let getModTime = fmap (formatTime defaultTimeLocale "%T") . + P.getModificationTime + curMtime <- case inps of + [] -> formatTime defaultTimeLocale "%T" <$> P.getZonedTime + _ -> catchError + (maximum <$> mapM getModTime inps) + (const (return "")) + return $ T2TMeta curDate curMtime (intercalate ", " inps) outp + +-- | Read Txt2Tags from an input string returning a Pandoc document +readTxt2Tags :: PandocMonad m + => ReaderOptions + -> String + -> m Pandoc +readTxt2Tags opts s = do + meta <- getT2TMeta + let parsed = flip runReader meta $ readWithM parseT2T (def {stateOptions = opts}) (s ++ "\n\n") + case parsed of + Right result -> return $ result + Left e -> throwError e + +-- | Read Txt2Tags (ignoring all macros) from an input string returning +-- a Pandoc document +-- readTxt2TagsNoMacros :: PandocMonad m => ReaderOptions -> String -> m Pandoc +-- readTxt2TagsNoMacros = readTxt2Tags + +parseT2T :: T2T Pandoc +parseT2T = do + -- Parse header if standalone flag is set + standalone <- getOption readerStandalone + when standalone parseHeader + body <- mconcat <$> manyTill block eof + meta' <- stateMeta <$> getState + return $ Pandoc meta' (B.toList body) + +parseHeader :: T2T () +parseHeader = do + () <$ try blankline <|> header + meta <- stateMeta <$> getState + optional blanklines + config <- manyTill setting (notFollowedBy setting) + -- TODO: Handle settings better + let settings = foldr (\(k,v) -> B.setMeta k (MetaString v)) meta config + updateState (\s -> s {stateMeta = settings}) <* optional blanklines + +header :: T2T () +header = titleline >> authorline >> dateline + +headerline :: B.ToMetaValue a => String -> T2T a -> T2T () +headerline field p = (() <$ try blankline) + <|> (p >>= updateState . B.setMeta field) + +titleline :: T2T () +titleline = + headerline "title" (trimInlines . mconcat <$> manyTill inline newline) + +authorline :: T2T () +authorline = + headerline "author" (sepBy author (char ';') <* newline) + where + author = trimInlines . mconcat <$> many (notFollowedBy (char ';' <|> newline) >> inline) + +dateline :: T2T () +dateline = headerline "date" (trimInlines . mconcat <$> manyTill inline newline) + +type Keyword = String +type Value = String + +setting :: T2T (Keyword, Value) +setting = do + string "%!" + keyword <- ignoreSpacesCap (many1 alphaNum) + char ':' + value <- ignoreSpacesCap (manyTill anyChar (newline)) + return (keyword, value) + +-- Blocks + +parseBlocks :: T2T Blocks +parseBlocks = mconcat <$> manyTill block eof + +block :: T2T Blocks +block = do + choice + [ mempty <$ blanklines + , quote + , hrule -- hrule must go above title + , title + , commentBlock + , verbatim + , rawBlock + , taggedBlock + , list + , table + , para + ] + +title :: T2T Blocks +title = try $ balancedTitle '+' <|> balancedTitle '=' + +balancedTitle :: Char -> T2T Blocks +balancedTitle c = try $ do + spaces + level <- length <$> many1 (char c) + guard (level <= 5) -- Max header level 5 + heading <- manyTill (noneOf "\n\r") (count level (char c)) + label <- optionMaybe (enclosed (char '[') (char ']') (alphaNum <|> oneOf "_-")) + many spaceChar *> newline + let attr = maybe nullAttr (\x -> (x, [], [])) label + return $ B.headerWith attr level (trimInlines $ B.text heading) + +para :: T2T Blocks +para = try $ do + ils <- parseInlines + nl <- option False (True <$ newline) + option (B.plain ils) (guard nl >> notFollowedBy listStart >> return (B.para ils)) + where + listStart = try bulletListStart <|> orderedListStart + +commentBlock :: T2T Blocks +commentBlock = try (blockMarkupArea (anyLine) (const mempty) "%%%") <|> comment + +-- Seperator and Strong line treated the same +hrule :: T2T Blocks +hrule = try $ do + spaces + line <- many1 (oneOf "=-_") + guard (length line >= 20) + B.horizontalRule <$ blankline + +quote :: T2T Blocks +quote = try $ do + lookAhead tab + rawQuote <- many1 (tab *> optional spaces *> anyLine) + contents <- parseFromString parseBlocks (intercalate "\n" rawQuote ++ "\n\n") + return $ B.blockQuote contents + +commentLine :: T2T Inlines +commentLine = comment + +-- List Parsing code from Org Reader + +list :: T2T Blocks +list = choice [bulletList, orderedList, definitionList] + +bulletList :: T2T Blocks +bulletList = B.bulletList . compactify + <$> many1 (listItem bulletListStart parseBlocks) + +orderedList :: T2T Blocks +orderedList = B.orderedList . compactify + <$> many1 (listItem orderedListStart parseBlocks) + +definitionList :: T2T Blocks +definitionList = try $ do + B.definitionList . compactifyDL <$> + many1 (listItem definitionListStart definitionListEnd) + +definitionListEnd :: T2T (Inlines, [Blocks]) +definitionListEnd = (,) <$> (mconcat <$> manyTill inline newline) <*> ((:[]) <$> parseBlocks) + +genericListStart :: T2T Char + -> T2T Int +genericListStart listMarker = try $ + (2+) <$> (length <$> many spaceChar + <* listMarker <* space <* notFollowedBy space) + +-- parses bullet list \start and returns its length (excl. following whitespace) +bulletListStart :: T2T Int +bulletListStart = genericListStart (char '-') + +orderedListStart :: T2T Int +orderedListStart = genericListStart (char '+' ) + +definitionListStart :: T2T Int +definitionListStart = genericListStart (char ':') + +-- parse raw text for one list item, excluding start marker and continuations +listItem :: T2T Int + -> T2T a + -> T2T a +listItem start end = try $ do + markerLength <- try start + firstLine <- anyLineNewline + blank <- option "" ("\n" <$ blankline) + rest <- concat <$> many (listContinuation markerLength) + parseFromString end $ firstLine ++ blank ++ rest + +-- continuation of a list item - indented and separated by blankline or endline. +-- Note: nested lists are parsed as continuations. +listContinuation :: Int + -> T2T String +listContinuation markerLength = try $ + notFollowedBy' (blankline >> blankline) + *> (mappend <$> (concat <$> many1 listLine) + <*> many blankline) + where listLine = try $ indentWith markerLength *> anyLineNewline + +anyLineNewline :: T2T String +anyLineNewline = (++ "\n") <$> anyLine + +indentWith :: Int -> T2T String +indentWith n = count n space + +-- Table + +table :: T2T Blocks +table = try $ do + tableHeader <- fmap snd <$> option mempty (try headerRow) + rows <- many1 (many commentLine *> tableRow) + let columns = transpose rows + let ncolumns = length columns + let aligns = map (foldr1 findAlign) (map (map fst) columns) + let rows' = map (map snd) rows + let size = maximum (map length rows') + let rowsPadded = map (pad size) rows' + let headerPadded = if (not (null tableHeader)) then pad size tableHeader else mempty + return $ B.table mempty + (zip aligns (replicate ncolumns 0.0)) + headerPadded rowsPadded + +pad :: (Monoid a) => Int -> [a] -> [a] +pad n xs = xs ++ (replicate (n - length xs) mempty) + + +findAlign :: Alignment -> Alignment -> Alignment +findAlign x y + | x == y = x + | otherwise = AlignDefault + +headerRow :: T2T [(Alignment, Blocks)] +headerRow = genericRow (string "||") + +tableRow :: T2T [(Alignment, Blocks)] +tableRow = genericRow (char '|') + +genericRow :: T2T a -> T2T [(Alignment, Blocks)] +genericRow start = try $ do + spaces *> start + manyTill tableCell newline <?> "genericRow" + + +tableCell :: T2T (Alignment, Blocks) +tableCell = try $ do + leftSpaces <- length <$> lookAhead (many1 space) -- Case of empty cell means we must lookAhead + content <- (manyTill inline (try $ lookAhead (cellEnd))) + rightSpaces <- length <$> many space + let align = + case compare leftSpaces rightSpaces of + LT -> AlignLeft + EQ -> AlignCenter + GT -> AlignRight + endOfCell + return $ (align, B.plain (B.trimInlines $ mconcat content)) + where + cellEnd = (void newline <|> (many1 space *> endOfCell)) + +endOfCell :: T2T () +endOfCell = try (skipMany1 $ char '|') <|> ( () <$ lookAhead newline) + +-- Raw area + +verbatim :: T2T Blocks +verbatim = genericBlock anyLineNewline B.codeBlock "```" + +rawBlock :: T2T Blocks +rawBlock = genericBlock anyLineNewline (B.para . B.str) "\"\"\"" + +taggedBlock :: T2T Blocks +taggedBlock = do + target <- getTarget + genericBlock anyLineNewline (B.rawBlock target) "'''" + +-- Generic + +genericBlock :: Monoid a => T2T a -> (a -> Blocks) -> String -> T2T Blocks +genericBlock p f s = blockMarkupArea p f s <|> blockMarkupLine p f s + +blockMarkupArea :: Monoid a => (T2T a) -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupArea p f s = try $ (do + string s *> blankline + f . mconcat <$> (manyTill p (eof <|> void (string s *> blankline)))) + +blockMarkupLine :: T2T a -> (a -> Blocks) -> String -> T2T Blocks +blockMarkupLine p f s = try (f <$> (string s *> space *> p)) + +-- Can be in either block or inline position +comment :: Monoid a => T2T a +comment = try $ do + atStart + notFollowedBy macro + mempty <$ (char '%' *> anyLine) + +-- Inline + +parseInlines :: T2T Inlines +parseInlines = trimInlines . mconcat <$> many1 inline + +inline :: T2T Inlines +inline = do + choice + [ endline + , macro + , commentLine + , whitespace + , url + , link + , image + , bold + , underline + , code + , raw + , tagged + , strike + , italic + , code + , str + , symbol + ] + +bold :: T2T Inlines +bold = inlineMarkup inline B.strong '*' (B.str) + +underline :: T2T Inlines +underline = inlineMarkup inline B.emph '_' (B.str) + +strike :: T2T Inlines +strike = inlineMarkup inline B.strikeout '-' (B.str) + +italic :: T2T Inlines +italic = inlineMarkup inline B.emph '/' (B.str) + +code :: T2T Inlines +code = inlineMarkup ((:[]) <$> anyChar) B.code '`' id + +raw :: T2T Inlines +raw = inlineMarkup ((:[]) <$> anyChar) B.text '"' id + +tagged :: T2T Inlines +tagged = do + target <- getTarget + inlineMarkup ((:[]) <$> anyChar) (B.rawInline target) '\'' id + +-- Parser for markup indicated by a double character. +-- Inline markup is greedy and glued +-- Greedy meaning ***a*** = Bold [Str "*a*"] +-- Glued meaning that markup must be tight to content +-- Markup can't pass newlines +inlineMarkup :: Monoid a + => (T2T a) -- Content parser + -> (a -> Inlines) -- Constructor + -> Char -- Fence + -> (String -> a) -- Special Case to handle ****** + -> T2T Inlines +inlineMarkup p f c special = try $ do + start <- many1 (char c) + let l = length start + guard (l >= 2) + when (l == 2) (void $ notFollowedBy space) + -- We must make sure that there is no space before the start of the + -- closing tags + body <- optionMaybe (try $ manyTill (noneOf "\n\r") $ + (try $ lookAhead (noneOf " " >> string [c,c] ))) + case body of + Just middle -> do + lastChar <- anyChar + end <- many1 (char c) + let parser inp = parseFromString (mconcat <$> many p) inp + let start' = case drop 2 start of + "" -> mempty + xs -> special xs + body' <- parser (middle ++ [lastChar]) + let end' = case drop 2 end of + "" -> mempty + xs -> special xs + return $ f (start' <> body' <> end') + Nothing -> do -- Either bad or case such as ***** + guard (l >= 5) + let body' = (replicate (l - 4) c) + return $ f (special body') + +link :: T2T Inlines +link = try imageLink <|> titleLink + +-- Link with title +titleLink :: T2T Inlines +titleLink = try $ do + char '[' + notFollowedBy space + tokens <- sepBy1 (many $ noneOf " ]") space + guard (length tokens >= 2) + char ']' + let link' = last tokens + guard (length link' > 0) + let tit = concat (intersperse " " (init tokens)) + return $ B.link link' "" (B.text tit) + +-- Link with image +imageLink :: T2T Inlines +imageLink = try $ do + char '[' + body <- image + many1 space + l <- manyTill (noneOf "\n\r ") (char ']') + return (B.link l "" body) + +macro :: T2T Inlines +macro = try $ do + name <- string "%%" *> oneOfStringsCI (map fst commands) + optional (try $ enclosed (char '(') (char ')') anyChar) + lookAhead (spaceChar <|> oneOf specialChars <|> newline) + maybe (return mempty) (\f -> B.str <$> asks f) (lookup name commands) + where + commands = [ ("date", date), ("mtime", mtime) + , ("infile", infile), ("outfile", outfile)] + +-- raw URLs in text are automatically linked +url :: T2T Inlines +url = try $ do + (rawUrl, escapedUrl) <- (try uri <|> emailAddress) + return $ B.link rawUrl "" (B.str escapedUrl) + +uri :: T2T (String, String) +uri = try $ do + address <- t2tURI + return (address, escapeURI address) + +-- The definition of a URI in the T2T source differs from the +-- actual definition. This is a transcription of the definition in +-- the source of v2.6 +--isT2TURI :: String -> Bool +--isT2TURI (parse t2tURI "" -> Right _) = True +--isT2TURI _ = False + +t2tURI :: T2T String +t2tURI = do + start <- try ((++) <$> proto <*> urlLogin) <|> guess + domain <- many1 chars + sep <- many (char '/') + form' <- option mempty ((:) <$> char '?' <*> many1 form) + anchor' <- option mempty ((:) <$> char '#' <*> many anchor) + return (start ++ domain ++ sep ++ form' ++ anchor') + where + protos = ["http", "https", "ftp", "telnet", "gopher", "wais"] + proto = (++) <$> oneOfStrings protos <*> string "://" + guess = (++) <$> (((++) <$> stringAnyCase "www" <*> option mempty ((:[]) <$> oneOf "23")) + <|> stringAnyCase "ftp") <*> ((:[]) <$> char '.') + login = alphaNum <|> oneOf "_.-" + pass = many (noneOf " @") + chars = alphaNum <|> oneOf "%._/~:,=$@&+-" + anchor = alphaNum <|> oneOf "%._0" + form = chars <|> oneOf ";*" + urlLogin = option mempty $ try ((\x y z -> x ++ y ++ [z]) <$> many1 login <*> option mempty ((:) <$> char ':' <*> pass) <*> char '@') + + +image :: T2T Inlines +image = try $ do + -- List taken from txt2tags source + let extensions = [".jpg", ".jpeg", ".gif", ".png", ".eps", ".bmp"] + char '[' + path <- manyTill (noneOf "\n\t\r ") (try $ lookAhead (oneOfStrings extensions)) + ext <- oneOfStrings extensions + char ']' + return $ B.image (path ++ ext) "" mempty + +-- Characters used in markup +specialChars :: String +specialChars = "%*-_/|:+;" + +tab :: T2T Char +tab = char '\t' + +space :: T2T Char +space = char ' ' + +spaces :: T2T String +spaces = many space + +endline :: T2T Inlines +endline = try $ do + newline + notFollowedBy blankline + notFollowedBy hrule + notFollowedBy title + notFollowedBy verbatim + notFollowedBy rawBlock + notFollowedBy taggedBlock + notFollowedBy quote + notFollowedBy list + notFollowedBy table + return $ B.softbreak + +str :: T2T Inlines +str = try $ do + B.str <$> many1 (noneOf $ specialChars ++ "\n\r ") + +whitespace :: T2T Inlines +whitespace = try $ B.space <$ spaceChar + +symbol :: T2T Inlines +symbol = B.str . (:[]) <$> oneOf specialChars + +-- Utility + +getTarget :: T2T String +getTarget = do + mv <- lookupMeta "target" . stateMeta <$> getState + let MetaString target = fromMaybe (MetaString "html") mv + return target + +atStart :: T2T () +atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) + +ignoreSpacesCap :: T2T String -> T2T String +ignoreSpacesCap p = map toLower <$> (spaces *> p <* spaces) diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs new file mode 100644 index 000000000..85b298a85 --- /dev/null +++ b/src/Text/Pandoc/SelfContained.hs @@ -0,0 +1,181 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.SelfContained + Copyright : Copyright (C) 2011-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Functions for converting an HTML file into one that can be viewed +offline, by incorporating linked images, CSS, and scripts into +the HTML using data URIs. +-} +module Text.Pandoc.SelfContained ( makeSelfContained ) where +import Text.HTML.TagSoup +import Network.URI (isURI, escapeURIString, URI(..), parseURI) +import Data.ByteString.Base64 +import qualified Data.ByteString.Char8 as B +import Data.ByteString (ByteString) +import System.FilePath (takeExtension, takeDirectory, (</>)) +import Data.Char (toLower, isAscii, isAlphaNum) +import Codec.Compression.GZip as Gzip +import qualified Data.ByteString.Lazy as L +import Control.Monad.Trans (MonadIO(..)) +import Text.Pandoc.Shared (renderTags', err, warn, trim) +import Text.Pandoc.MediaBag (MediaBag) +import Text.Pandoc.MIME (MimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Options (WriterOptions(..)) +import Data.List (isPrefixOf) +import Control.Applicative ((<|>)) +import Text.Parsec (runParserT, ParsecT) +import qualified Text.Parsec as P +import Control.Monad.Trans (lift) +import Text.Pandoc.Class (fetchItem, runIO, setMediaBag) + +isOk :: Char -> Bool +isOk c = isAscii c && isAlphaNum c + +makeDataURI :: String -> ByteString -> String +makeDataURI mime raw = + if textual + then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw) + else "data:" ++ mime' ++ ";base64," ++ toString (encode raw) + where textual = "text/" `Data.List.isPrefixOf` mime + mime' = if textual && ';' `notElem` mime + then mime ++ ";charset=utf-8" + else mime -- mime type already has charset + +convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) +convertTag media sourceURL t@(TagOpen tagname as) + | tagname `elem` + ["img", "embed", "video", "input", "audio", "source", "track"] = do + as' <- mapM processAttribute as + return $ TagOpen tagname as' + where processAttribute (x,y) = + if x == "src" || x == "href" || x == "poster" + then do + enc <- getDataURI media sourceURL (fromAttrib "type" t) y + return (x, enc) + else return (x,y) +convertTag media sourceURL t@(TagOpen "script" as) = + case fromAttrib "src" t of + [] -> return t + src -> do + enc <- getDataURI media sourceURL (fromAttrib "type" t) src + return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) +convertTag media sourceURL t@(TagOpen "link" as) = + case fromAttrib "href" t of + [] -> return t + src -> do + enc <- getDataURI media sourceURL (fromAttrib "type" t) src + return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) +convertTag _ _ t = return t + +cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString + -> IO ByteString +cssURLs media sourceURL d orig = do + res <- runParserT (parseCSSUrls media sourceURL d) () "css" orig + case res of + Left e -> warn ("Could not parse CSS: " ++ show e) >> return orig + Right bs -> return bs + +parseCSSUrls :: MediaBag -> Maybe String -> FilePath + -> ParsecT ByteString () IO ByteString +parseCSSUrls media sourceURL d = B.concat <$> P.many + (pCSSWhite <|> pCSSComment <|> pCSSUrl media sourceURL d <|> pCSSOther) + +-- Note: some whitespace in CSS is significant, so we can't collapse it! +pCSSWhite :: ParsecT ByteString () IO ByteString +pCSSWhite = B.singleton <$> P.space <* P.spaces + +pCSSComment :: ParsecT ByteString () IO ByteString +pCSSComment = P.try $ do + P.string "/*" + P.manyTill P.anyChar (P.try (P.string "*/")) + return B.empty + +pCSSOther :: ParsecT ByteString () IO ByteString +pCSSOther = do + (B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|> + (B.singleton <$> P.char 'u') <|> + (B.singleton <$> P.char '/') + +pCSSUrl :: MediaBag -> Maybe String -> FilePath + -> ParsecT ByteString () IO ByteString +pCSSUrl media sourceURL d = P.try $ do + P.string "url(" + P.spaces + quote <- P.option Nothing (Just <$> P.oneOf "\"'") + url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote) + P.spaces + P.char ')' + let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++ + maybe "" (:[]) quote ++ ")") + case trim url of + '#':_ -> return fallback + 'd':'a':'t':'a':':':_ -> return fallback + u -> do let url' = if isURI u then u else d </> u + enc <- lift $ getDataURI media sourceURL "" url' + return (B.pack $ "url(" ++ enc ++ ")") + + +getDataURI :: MediaBag -> Maybe String -> MimeType -> String + -> IO String +getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri +getDataURI media sourceURL mimetype src = do + let ext = map toLower $ takeExtension src + fetchResult <- runIO $ do setMediaBag media + fetchItem sourceURL src + (raw, respMime) <- case fetchResult of + Left msg -> err 67 $ "Could not fetch " ++ src ++ + "\n" ++ show msg + Right x -> return x + let raw' = if ext == ".gz" + then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks + $ [raw] + else raw + let mime = case (mimetype, respMime) of + ("",Nothing) -> error + $ "Could not determine mime type for `" ++ src ++ "'" + (x, Nothing) -> x + (_, Just x ) -> x + let cssSourceURL = case parseURI src of + Just u + | uriScheme u `elem` ["http:","https:"] -> + Just $ show u{ uriPath = "", + uriQuery = "", + uriFragment = "" } + _ -> Nothing + result <- if mime == "text/css" + then cssURLs media cssSourceURL (takeDirectory src) raw' + else return raw' + return $ makeDataURI mime result + +-- | Convert HTML into self-contained HTML, incorporating images, +-- scripts, and CSS using data: URIs. +makeSelfContained :: MonadIO m => WriterOptions -> MediaBag -> String -> m String +makeSelfContained opts mediabag inp = liftIO $ do + let tags = parseTags inp + out' <- mapM (convertTag mediabag (writerSourceURL opts)) tags + return $ renderTags' out' diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs new file mode 100644 index 000000000..268a5052e --- /dev/null +++ b/src/Text/Pandoc/Shared.hs @@ -0,0 +1,883 @@ +{-# LANGUAGE DeriveDataTypeable, CPP, MultiParamTypeClasses, + FlexibleContexts, ScopedTypeVariables, PatternGuards, + ViewPatterns #-} +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Shared + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Utility functions and definitions used by the various Pandoc modules. +-} +module Text.Pandoc.Shared ( + -- * List processing + splitBy, + splitByIndices, + splitStringByIndices, + substitute, + ordNub, + -- * Text processing + backslashEscapes, + escapeStringUsing, + stripTrailingNewlines, + trim, + triml, + trimr, + stripFirstAndLast, + camelCaseToHyphenated, + toRomanNumeral, + escapeURI, + tabFilter, + -- * Date/time + normalizeDate, + -- * Pandoc block and inline list processing + orderedListMarkers, + normalizeSpaces, + extractSpaces, + removeFormatting, + deNote, + stringify, + capitalize, + compactify, + compactifyDL, + linesToPara, + Element (..), + hierarchicalize, + uniqueIdent, + inlineListToIdentifier, + isHeaderBlock, + headerShift, + isTightList, + addMetaField, + makeMeta, + -- * TagSoup HTML handling + renderTags', + -- * File handling + inDirectory, + getDefaultReferenceDocx, + getDefaultReferenceODT, + readDataFile, + readDataFileUTF8, + openURL, + collapseFilePath, + filteredFilesFromArchive, + -- * Error handling + err, + warn, + mapLeft, + -- * for squashing blocks + blocksToInlines, + -- * Safe read + safeRead, + -- * Temp directory + withTempDir, + -- * Version + pandocVersion + ) where + +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) +import qualified Text.Pandoc.Builder as B +import qualified Text.Pandoc.UTF8 as UTF8 +import System.Exit (exitWith, ExitCode(..)) +import Data.Char ( toLower, isLower, isUpper, isAlpha, + isLetter, isDigit, isSpace ) +import Data.List ( find, stripPrefix, intercalate ) +import Data.Maybe (mapMaybe) +import Data.Version ( showVersion ) +import qualified Data.Map as M +import Network.URI ( escapeURIString, unEscapeString ) +import qualified Data.Set as Set +import System.Directory +import System.FilePath (splitDirectories, isPathSeparator) +import qualified System.FilePath.Posix as Posix +import Text.Pandoc.MIME (MimeType) +import System.FilePath ( (</>) ) +import Data.Generics (Typeable, Data) +import qualified Control.Monad.State as S +import Control.Monad.Trans (MonadIO (..)) +import qualified Control.Exception as E +import Control.Monad (msum, unless, MonadPlus(..)) +import Text.Pandoc.Pretty (charWidth) +import Text.Pandoc.Compat.Time +import Data.Time.Clock.POSIX +import System.IO (stderr) +import System.IO.Temp +import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), + renderOptions) +import Data.Monoid ((<>)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as B8 +import Data.ByteString.Base64 (decodeLenient) +import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Text as T (toUpper, pack, unpack) +import Data.ByteString.Lazy (toChunks, fromChunks) +import qualified Data.ByteString.Lazy as BL +import Paths_pandoc (version) + +import Codec.Archive.Zip + +#ifdef EMBED_DATA_FILES +import Text.Pandoc.Data (dataFiles) +#else +import Paths_pandoc (getDataFileName) +#endif +#ifdef HTTP_CLIENT +import Network.HTTP.Client (httpLbs, responseBody, responseHeaders, + Request(port,host,requestHeaders)) +import Network.HTTP.Client (parseRequest) +import Network.HTTP.Client (newManager) +import Network.HTTP.Client.Internal (addProxy) +import Network.HTTP.Client.TLS (tlsManagerSettings) +import System.Environment (getEnv) +import Network.HTTP.Types.Header ( hContentType, hUserAgent) +import Network (withSocketsDo) +#else +import Network.URI (parseURI) +import Network.HTTP (findHeader, rspBody, + RequestMethod(..), HeaderName(..), mkRequest) +import Network.Browser (browse, setAllowRedirects, setOutHandler, request) +#endif + +-- | Version number of pandoc library. +pandocVersion :: String +pandocVersion = showVersion version + +-- +-- List processing +-- + +-- | Split list by groups of one or more sep. +splitBy :: (a -> Bool) -> [a] -> [[a]] +splitBy _ [] = [] +splitBy isSep lst = + let (first, rest) = break isSep lst + rest' = dropWhile isSep rest + in first:(splitBy isSep rest') + +splitByIndices :: [Int] -> [a] -> [[a]] +splitByIndices [] lst = [lst] +splitByIndices (x:xs) lst = first:(splitByIndices (map (\y -> y - x) xs) rest) + where (first, rest) = splitAt x lst + +-- | Split string into chunks divided at specified indices. +splitStringByIndices :: [Int] -> [Char] -> [[Char]] +splitStringByIndices [] lst = [lst] +splitStringByIndices (x:xs) lst = + let (first, rest) = splitAt' x lst in + first : (splitStringByIndices (map (\y -> y - x) xs) rest) + +splitAt' :: Int -> [Char] -> ([Char],[Char]) +splitAt' _ [] = ([],[]) +splitAt' n xs | n <= 0 = ([],xs) +splitAt' n (x:xs) = (x:ys,zs) + where (ys,zs) = splitAt' (n - charWidth x) xs + +-- | Replace each occurrence of one sublist in a list with another. +substitute :: (Eq a) => [a] -> [a] -> [a] -> [a] +substitute _ _ [] = [] +substitute [] _ xs = xs +substitute target replacement lst@(x:xs) = + case stripPrefix target lst of + Just lst' -> replacement ++ substitute target replacement lst' + Nothing -> x : substitute target replacement xs + +ordNub :: (Ord a) => [a] -> [a] +ordNub l = go Set.empty l + where + go _ [] = [] + go s (x:xs) = if x `Set.member` s then go s xs + else x : go (Set.insert x s) xs + +-- +-- Text processing +-- + +-- | Returns an association list of backslash escapes for the +-- designated characters. +backslashEscapes :: [Char] -- ^ list of special characters to escape + -> [(Char, String)] +backslashEscapes = map (\ch -> (ch, ['\\',ch])) + +-- | Escape a string of characters, using an association list of +-- characters and strings. +escapeStringUsing :: [(Char, String)] -> String -> String +escapeStringUsing _ [] = "" +escapeStringUsing escapeTable (x:xs) = + case (lookup x escapeTable) of + Just str -> str ++ rest + Nothing -> x:rest + where rest = escapeStringUsing escapeTable xs + +-- | Strip trailing newlines from string. +stripTrailingNewlines :: String -> String +stripTrailingNewlines = reverse . dropWhile (== '\n') . reverse + +-- | Remove leading and trailing space (including newlines) from string. +trim :: String -> String +trim = triml . trimr + +-- | Remove leading space (including newlines) from string. +triml :: String -> String +triml = dropWhile (`elem` " \r\n\t") + +-- | Remove trailing space (including newlines) from string. +trimr :: String -> String +trimr = reverse . triml . reverse + +-- | Strip leading and trailing characters from string +stripFirstAndLast :: String -> String +stripFirstAndLast str = + drop 1 $ take ((length str) - 1) str + +-- | Change CamelCase word to hyphenated lowercase (e.g., camel-case). +camelCaseToHyphenated :: String -> String +camelCaseToHyphenated [] = "" +camelCaseToHyphenated (a:b:rest) | isLower a && isUpper b = + a:'-':(toLower b):(camelCaseToHyphenated rest) +camelCaseToHyphenated (a:rest) = (toLower a):(camelCaseToHyphenated rest) + +-- | Convert number < 4000 to uppercase roman numeral. +toRomanNumeral :: Int -> String +toRomanNumeral x + | x >= 4000 || x < 0 = "?" + | x >= 1000 = "M" ++ toRomanNumeral (x - 1000) + | x >= 900 = "CM" ++ toRomanNumeral (x - 900) + | x >= 500 = "D" ++ toRomanNumeral (x - 500) + | x >= 400 = "CD" ++ toRomanNumeral (x - 400) + | x >= 100 = "C" ++ toRomanNumeral (x - 100) + | x >= 90 = "XC" ++ toRomanNumeral (x - 90) + | x >= 50 = "L" ++ toRomanNumeral (x - 50) + | x >= 40 = "XL" ++ toRomanNumeral (x - 40) + | x >= 10 = "X" ++ toRomanNumeral (x - 10) + | x == 9 = "IX" + | x >= 5 = "V" ++ toRomanNumeral (x - 5) + | x == 4 = "IV" + | x >= 1 = "I" ++ toRomanNumeral (x - 1) + | otherwise = "" + +-- | Escape whitespace and some punctuation characters in URI. +escapeURI :: String -> String +escapeURI = escapeURIString (not . needsEscaping) + where needsEscaping c = isSpace c || c `elem` + ['<','>','|','"','{','}','[',']','^', '`'] + + +-- | Convert tabs to spaces and filter out DOS line endings. +-- Tabs will be preserved if tab stop is set to 0. +tabFilter :: Int -- ^ Tab stop + -> String -- ^ Input + -> String +tabFilter tabStop = + let go _ [] = "" + go _ ('\n':xs) = '\n' : go tabStop xs + go _ ('\r':'\n':xs) = '\n' : go tabStop xs + go _ ('\r':xs) = '\n' : go tabStop xs + go spsToNextStop ('\t':xs) = + if tabStop == 0 + then '\t' : go tabStop xs + else replicate spsToNextStop ' ' ++ go tabStop xs + go 1 (x:xs) = + x : go tabStop xs + go spsToNextStop (x:xs) = + x : go (spsToNextStop - 1) xs + in go tabStop + +-- +-- Date/time +-- + +-- | Parse a date and convert (if possible) to "YYYY-MM-DD" format. We +-- limit years to the range 1601-9999 (ISO 8601 accepts greater than +-- or equal to 1583, but MS Word only accepts dates starting 1601). +normalizeDate :: String -> Maybe String +normalizeDate s = fmap (formatTime defaultTimeLocale "%F") + (msum $ map (\fs -> parsetimeWith fs s >>= rejectBadYear) formats :: Maybe Day) + where rejectBadYear day = case toGregorian day of + (y, _, _) | y >= 1601 && y <= 9999 -> Just day + _ -> Nothing + parsetimeWith = +#if MIN_VERSION_time(1,5,0) + parseTimeM True defaultTimeLocale +#else + parseTime defaultTimeLocale +#endif + formats = ["%x","%m/%d/%Y", "%D","%F", "%d %b %Y", + "%d %B %Y", "%b. %d, %Y", "%B %d, %Y", + "%Y%m%d", "%Y%m", "%Y"] + +-- +-- Pandoc block and inline list processing +-- + +-- | Generate infinite lazy list of markers for an ordered list, +-- depending on list attributes. +orderedListMarkers :: (Int, ListNumberStyle, ListNumberDelim) -> [String] +orderedListMarkers (start, numstyle, numdelim) = + let singleton c = [c] + nums = case numstyle of + DefaultStyle -> map show [start..] + Example -> map show [start..] + Decimal -> map show [start..] + UpperAlpha -> drop (start - 1) $ cycle $ + map singleton ['A'..'Z'] + LowerAlpha -> drop (start - 1) $ cycle $ + map singleton ['a'..'z'] + UpperRoman -> map toRomanNumeral [start..] + LowerRoman -> map (map toLower . toRomanNumeral) [start..] + inDelim str = case numdelim of + DefaultDelim -> str ++ "." + Period -> str ++ "." + OneParen -> str ++ ")" + TwoParens -> "(" ++ str ++ ")" + in map inDelim nums + +-- | Normalize a list of inline elements: remove leading and trailing +-- @Space@ elements, collapse double @Space@s into singles, and +-- remove empty Str elements. +normalizeSpaces :: [Inline] -> [Inline] +normalizeSpaces = cleanup . dropWhile isSpaceOrEmpty + where cleanup [] = [] + cleanup (Space:rest) = case dropWhile isSpaceOrEmpty rest of + [] -> [] + (x:xs) -> Space : x : cleanup xs + cleanup ((Str ""):rest) = cleanup rest + cleanup (x:rest) = x : cleanup rest + +isSpaceOrEmpty :: Inline -> Bool +isSpaceOrEmpty Space = True +isSpaceOrEmpty (Str "") = True +isSpaceOrEmpty _ = False + +-- | Extract the leading and trailing spaces from inside an inline element +-- and place them outside the element. SoftBreaks count as Spaces for +-- these purposes. +extractSpaces :: (Inlines -> Inlines) -> Inlines -> Inlines +extractSpaces f is = + let contents = B.unMany is + left = case viewl contents of + (Space :< _) -> B.space + (SoftBreak :< _) -> B.softbreak + _ -> mempty + right = case viewr contents of + (_ :> Space) -> B.space + (_ :> SoftBreak) -> B.softbreak + _ -> mempty in + (left <> f (B.trimInlines . B.Many $ contents) <> right) + +-- | Extract inlines, removing formatting. +removeFormatting :: Walkable Inline a => a -> [Inline] +removeFormatting = query go . walk deNote + where go :: Inline -> [Inline] + go (Str xs) = [Str xs] + go Space = [Space] + go SoftBreak = [SoftBreak] + go (Code _ x) = [Str x] + go (Math _ x) = [Str x] + go LineBreak = [Space] + go _ = [] + +deNote :: Inline -> Inline +deNote (Note _) = Str "" +deNote x = x + +-- | Convert pandoc structure to a string with formatting removed. +-- Footnotes are skipped (since we don't want their contents in link +-- labels). +stringify :: Walkable Inline a => a -> String +stringify = query go . walk deNote + where go :: Inline -> [Char] + go Space = " " + go SoftBreak = " " + go (Str x) = x + go (Code _ x) = x + go (Math _ x) = x + go (RawInline (Format "html") ('<':'b':'r':_)) = " " -- see #2105 + go LineBreak = " " + go _ = "" + +-- | Bring all regular text in a pandoc structure to uppercase. +-- +-- This function correctly handles cases where a lowercase character doesn't +-- match to a single uppercase character – e.g. “Straße” would be converted +-- to “STRASSE”, not “STRAßE”. +capitalize :: Walkable Inline a => a -> a +capitalize = walk go + where go :: Inline -> Inline + go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go x = x + +-- | Change final list item from @Para@ to @Plain@ if the list contains +-- no other @Para@ blocks. Like compactify, but operates on @Blocks@ rather +-- than @[Block]@. +compactify :: [Blocks] -- ^ List of list items (each a list of blocks) + -> [Blocks] +compactify [] = [] +compactify items = + let (others, final) = (init items, last items) + in case reverse (B.toList final) of + (Para a:xs) -> case [Para x | Para x <- concatMap B.toList items] of + -- if this is only Para, change to Plain + [_] -> others ++ [B.fromList (reverse $ Plain a : xs)] + _ -> items + _ -> items + +-- | Like @compactify@, but acts on items of definition lists. +compactifyDL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] +compactifyDL items = + let defs = concatMap snd items + in case reverse (concatMap B.toList defs) of + (Para x:xs) + | not (any isPara xs) -> + let (t,ds) = last items + lastDef = B.toList $ last ds + ds' = init ds ++ + if null lastDef + then [B.fromList lastDef] + else [B.fromList $ init lastDef ++ [Plain x]] + in init items ++ [(t, ds')] + | otherwise -> items + _ -> items + +-- | Combine a list of lines by adding hard linebreaks. +combineLines :: [[Inline]] -> [Inline] +combineLines = intercalate [LineBreak] + +-- | Convert a list of lines into a paragraph with hard line breaks. This is +-- useful e.g. for rudimentary support of LineBlock elements in writers. +linesToPara :: [[Inline]] -> Block +linesToPara = Para . combineLines + +isPara :: Block -> Bool +isPara (Para _) = True +isPara _ = False + +-- | Data structure for defining hierarchical Pandoc documents +data Element = Blk Block + | Sec Int [Int] Attr [Inline] [Element] + -- lvl num attributes label contents + deriving (Eq, Read, Show, Typeable, Data) + +instance Walkable Inline Element where + walk f (Blk x) = Blk (walk f x) + walk f (Sec lev nums attr ils elts) = Sec lev nums attr (walk f ils) (walk f elts) + walkM f (Blk x) = Blk `fmap` walkM f x + walkM f (Sec lev nums attr ils elts) = do + ils' <- walkM f ils + elts' <- walkM f elts + return $ Sec lev nums attr ils' elts' + query f (Blk x) = query f x + query f (Sec _ _ _ ils elts) = query f ils <> query f elts + +instance Walkable Block Element where + walk f (Blk x) = Blk (walk f x) + walk f (Sec lev nums attr ils elts) = Sec lev nums attr (walk f ils) (walk f elts) + walkM f (Blk x) = Blk `fmap` walkM f x + walkM f (Sec lev nums attr ils elts) = do + ils' <- walkM f ils + elts' <- walkM f elts + return $ Sec lev nums attr ils' elts' + query f (Blk x) = query f x + query f (Sec _ _ _ ils elts) = query f ils <> query f elts + + +-- | Convert Pandoc inline list to plain text identifier. HTML +-- identifiers must start with a letter, and may contain only +-- letters, digits, and the characters _-. +inlineListToIdentifier :: [Inline] -> String +inlineListToIdentifier = + dropWhile (not . isAlpha) . intercalate "-" . words . + map (nbspToSp . toLower) . + filter (\c -> isLetter c || isDigit c || c `elem` "_-. ") . + stringify + where nbspToSp '\160' = ' ' + nbspToSp x = x + +-- | Convert list of Pandoc blocks into (hierarchical) list of Elements +hierarchicalize :: [Block] -> [Element] +hierarchicalize blocks = S.evalState (hierarchicalizeWithIds blocks) [] + +hierarchicalizeWithIds :: [Block] -> S.State [Int] [Element] +hierarchicalizeWithIds [] = return [] +hierarchicalizeWithIds ((Header level attr@(_,classes,_) title'):xs) = do + lastnum <- S.get + let lastnum' = take level lastnum + let newnum = case length lastnum' of + x | "unnumbered" `elem` classes -> [] + | x >= level -> init lastnum' ++ [last lastnum' + 1] + | otherwise -> lastnum ++ + replicate (level - length lastnum - 1) 0 ++ [1] + unless (null newnum) $ S.put newnum + let (sectionContents, rest) = break (headerLtEq level) xs + sectionContents' <- hierarchicalizeWithIds sectionContents + rest' <- hierarchicalizeWithIds rest + return $ Sec level newnum attr title' sectionContents' : rest' +hierarchicalizeWithIds ((Div ("",["references"],[]) + (Header level (ident,classes,kvs) title' : xs)):ys) = + hierarchicalizeWithIds ((Header level (ident,("references":classes),kvs) + title') : (xs ++ ys)) +hierarchicalizeWithIds (x:rest) = do + rest' <- hierarchicalizeWithIds rest + return $ (Blk x) : rest' + +headerLtEq :: Int -> Block -> Bool +headerLtEq level (Header l _ _) = l <= level +headerLtEq level (Div ("",["references"],[]) (Header l _ _ : _)) = l <= level +headerLtEq _ _ = False + +-- | Generate a unique identifier from a list of inlines. +-- Second argument is a list of already used identifiers. +uniqueIdent :: [Inline] -> Set.Set String -> String +uniqueIdent title' usedIdents + = let baseIdent = case inlineListToIdentifier title' of + "" -> "section" + x -> x + numIdent n = baseIdent ++ "-" ++ show n + in if baseIdent `Set.member` usedIdents + then case find (\x -> not $ numIdent x `Set.member` usedIdents) ([1..60000] :: [Int]) of + Just x -> numIdent x + Nothing -> baseIdent -- if we have more than 60,000, allow repeats + else baseIdent + +-- | True if block is a Header block. +isHeaderBlock :: Block -> Bool +isHeaderBlock (Header _ _ _) = True +isHeaderBlock _ = False + +-- | Shift header levels up or down. +headerShift :: Int -> Pandoc -> Pandoc +headerShift n = walk shift + where shift :: Block -> Block + shift (Header level attr inner) = Header (level + n) attr inner + shift x = x + +-- | Detect if a list is tight. +isTightList :: [[Block]] -> Bool +isTightList = all firstIsPlain + where firstIsPlain (Plain _ : _) = True + firstIsPlain _ = False + +-- | Set a field of a 'Meta' object. If the field already has a value, +-- convert it into a list with the new value appended to the old value(s). +addMetaField :: ToMetaValue a + => String + -> a + -> Meta + -> Meta +addMetaField key val (Meta meta) = + Meta $ M.insertWith combine key (toMetaValue val) meta + where combine newval (MetaList xs) = MetaList (xs ++ tolist newval) + combine newval x = MetaList [x, newval] + tolist (MetaList ys) = ys + tolist y = [y] + +-- | Create 'Meta' from old-style title, authors, date. This is +-- provided to ease the transition from the old API. +makeMeta :: [Inline] -> [[Inline]] -> [Inline] -> Meta +makeMeta title authors date = + addMetaField "title" (B.fromList title) + $ addMetaField "author" (map B.fromList authors) + $ addMetaField "date" (B.fromList date) + $ nullMeta + +-- +-- TagSoup HTML handling +-- + +-- | Render HTML tags. +renderTags' :: [Tag String] -> String +renderTags' = renderTagsOptions + renderOptions{ optMinimize = matchTags ["hr", "br", "img", + "meta", "link"] + , optRawTag = matchTags ["script", "style"] } + where matchTags = \tags -> flip elem tags . map toLower + +-- +-- File handling +-- + +-- | Perform an IO action in a directory, returning to starting directory. +inDirectory :: FilePath -> IO a -> IO a +inDirectory path action = E.bracket + getCurrentDirectory + setCurrentDirectory + (const $ setCurrentDirectory path >> action) + +getDefaultReferenceDocx :: Maybe FilePath -> IO Archive +getDefaultReferenceDocx datadir = do + let paths = ["[Content_Types].xml", + "_rels/.rels", + "docProps/app.xml", + "docProps/core.xml", + "word/document.xml", + "word/fontTable.xml", + "word/footnotes.xml", + "word/numbering.xml", + "word/settings.xml", + "word/webSettings.xml", + "word/styles.xml", + "word/_rels/document.xml.rels", + "word/_rels/footnotes.xml.rels", + "word/theme/theme1.xml"] + let toLazy = fromChunks . (:[]) + let pathToEntry path = do epochtime <- (floor . utcTimeToPOSIXSeconds) <$> + getCurrentTime + contents <- toLazy <$> readDataFile datadir + ("docx/" ++ path) + return $ toEntry path epochtime contents + mbArchive <- case datadir of + Nothing -> return Nothing + Just d -> do + exists <- doesFileExist (d </> "reference.docx") + if exists + then return (Just (d </> "reference.docx")) + else return Nothing + case mbArchive of + Just arch -> toArchive <$> BL.readFile arch + Nothing -> foldr addEntryToArchive emptyArchive <$> + mapM pathToEntry paths + +getDefaultReferenceODT :: Maybe FilePath -> IO Archive +getDefaultReferenceODT datadir = do + let paths = ["mimetype", + "manifest.rdf", + "styles.xml", + "content.xml", + "meta.xml", + "settings.xml", + "Configurations2/accelerator/current.xml", + "Thumbnails/thumbnail.png", + "META-INF/manifest.xml"] + let pathToEntry path = do epochtime <- floor `fmap` getPOSIXTime + contents <- (fromChunks . (:[])) `fmap` + readDataFile datadir ("odt/" ++ path) + return $ toEntry path epochtime contents + mbArchive <- case datadir of + Nothing -> return Nothing + Just d -> do + exists <- doesFileExist (d </> "reference.odt") + if exists + then return (Just (d </> "reference.odt")) + else return Nothing + case mbArchive of + Just arch -> toArchive <$> BL.readFile arch + Nothing -> foldr addEntryToArchive emptyArchive <$> + mapM pathToEntry paths + + +readDefaultDataFile :: FilePath -> IO BS.ByteString +readDefaultDataFile "reference.docx" = + (BS.concat . toChunks . fromArchive) <$> getDefaultReferenceDocx Nothing +readDefaultDataFile "reference.odt" = + (BS.concat . toChunks . fromArchive) <$> getDefaultReferenceODT Nothing +readDefaultDataFile fname = +#ifdef EMBED_DATA_FILES + case lookup (makeCanonical fname) dataFiles of + Nothing -> err 97 $ "Could not find data file " ++ fname + Just contents -> return contents + where makeCanonical = Posix.joinPath . transformPathParts . splitDirectories + transformPathParts = reverse . foldl go [] + go as "." = as + go (_:as) ".." = as + go as x = x : as +#else + getDataFileName fname' >>= checkExistence >>= BS.readFile + where fname' = if fname == "MANUAL.txt" then fname else "data" </> fname + +checkExistence :: FilePath -> IO FilePath +checkExistence fn = do + exists <- doesFileExist fn + if exists + then return fn + else err 97 ("Could not find data file " ++ fn) +#endif + +-- | Read file from specified user data directory or, if not found there, from +-- Cabal data directory. +readDataFile :: Maybe FilePath -> FilePath -> IO BS.ByteString +readDataFile Nothing fname = readDefaultDataFile fname +readDataFile (Just userDir) fname = do + exists <- doesFileExist (userDir </> fname) + if exists + then BS.readFile (userDir </> fname) + else readDefaultDataFile fname + +-- | Same as 'readDataFile' but returns a String instead of a ByteString. +readDataFileUTF8 :: Maybe FilePath -> FilePath -> IO String +readDataFileUTF8 userDir fname = + UTF8.toString `fmap` readDataFile userDir fname + +-- | Read from a URL and return raw data and maybe mime type. +openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe MimeType)) +openURL u + | Just u'' <- stripPrefix "data:" u = + let mime = takeWhile (/=',') u'' + contents = B8.pack $ unEscapeString $ drop 1 $ dropWhile (/=',') u'' + in return $ Right (decodeLenient contents, Just mime) +#ifdef HTTP_CLIENT + | otherwise = withSocketsDo $ E.try $ do + let parseReq = parseRequest + (proxy :: Either E.SomeException String) <- E.try $ getEnv "http_proxy" + (useragent :: Either E.SomeException String) <- E.try $ getEnv "USER_AGENT" + req <- parseReq u + req' <- case proxy of + Left _ -> return req + Right pr -> (parseReq pr >>= \r -> + return $ addProxy (host r) (port r) req) + `mplus` return req + req'' <- case useragent of + Left _ -> return req' + Right ua -> do + let headers = requestHeaders req' + let useragentheader = (hUserAgent, B8.pack ua) + let headers' = useragentheader:headers + return $ req' {requestHeaders = headers'} + resp <- newManager tlsManagerSettings >>= httpLbs req'' + return (BS.concat $ toChunks $ responseBody resp, + UTF8.toString `fmap` lookup hContentType (responseHeaders resp)) +#else + | otherwise = E.try $ getBodyAndMimeType `fmap` browse + (do liftIO $ UTF8.hPutStrLn stderr $ "Fetching " ++ u ++ "..." + setOutHandler $ const (return ()) + setAllowRedirects True + request (getRequest' u')) + where getBodyAndMimeType (_, r) = (rspBody r, findHeader HdrContentType r) + getRequest' uriString = case parseURI uriString of + Nothing -> error ("Not a valid URL: " ++ + uriString) + Just v -> mkRequest GET v + u' = escapeURIString (/= '|') u -- pipes are rejected by Network.URI +#endif + +-- +-- Error reporting +-- + +err :: MonadIO m => Int -> String -> m a +err exitCode msg = liftIO $ do + UTF8.hPutStrLn stderr msg + exitWith $ ExitFailure exitCode + return undefined + +warn :: MonadIO m => String -> m () +warn msg = liftIO $ do + UTF8.hPutStrLn stderr $ "[warning] " ++ msg + +mapLeft :: (a -> b) -> Either a c -> Either b c +mapLeft f (Left x) = Left (f x) +mapLeft _ (Right x) = Right x + +-- | Remove intermediate "." and ".." directories from a path. +-- +-- > collapseFilePath "./foo" == "foo" +-- > collapseFilePath "/bar/../baz" == "/baz" +-- > collapseFilePath "/../baz" == "/../baz" +-- > collapseFilePath "parent/foo/baz/../bar" == "parent/foo/bar" +-- > collapseFilePath "parent/foo/baz/../../bar" == "parent/bar" +-- > collapseFilePath "parent/foo/.." == "parent" +-- > collapseFilePath "/parent/foo/../../bar" == "/bar" +collapseFilePath :: FilePath -> FilePath +collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories + where + go rs "." = rs + go r@(p:rs) ".." = case p of + ".." -> ("..":r) + (checkPathSeperator -> Just True) -> ("..":r) + _ -> rs + go _ (checkPathSeperator -> Just True) = [[Posix.pathSeparator]] + go rs x = x:rs + isSingleton [] = Nothing + isSingleton [x] = Just x + isSingleton _ = Nothing + checkPathSeperator = fmap isPathSeparator . isSingleton + +-- +-- File selection from the archive +-- +filteredFilesFromArchive :: Archive -> (FilePath -> Bool) -> [(FilePath, BL.ByteString)] +filteredFilesFromArchive zf f = + mapMaybe (fileAndBinary zf) (filter f (filesInArchive zf)) + where + fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString) + fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e) + +--- +--- Squash blocks into inlines +--- + +blockToInlines :: Block -> [Inline] +blockToInlines (Plain ils) = ils +blockToInlines (Para ils) = ils +blockToInlines (LineBlock lns) = combineLines lns +blockToInlines (CodeBlock attr str) = [Code attr str] +blockToInlines (RawBlock fmt str) = [RawInline fmt str] +blockToInlines (BlockQuote blks) = blocksToInlines blks +blockToInlines (OrderedList _ blkslst) = + concatMap blocksToInlines blkslst +blockToInlines (BulletList blkslst) = + concatMap blocksToInlines blkslst +blockToInlines (DefinitionList pairslst) = + concatMap f pairslst + where + f (ils, blkslst) = ils ++ + [Str ":", Space] ++ + (concatMap blocksToInlines blkslst) +blockToInlines (Header _ _ ils) = ils +blockToInlines (HorizontalRule) = [] +blockToInlines (Table _ _ _ headers rows) = + intercalate [LineBreak] $ map (concatMap blocksToInlines) tbl + where + tbl = headers : rows +blockToInlines (Div _ blks) = blocksToInlines blks +blockToInlines Null = [] + +blocksToInlinesWithSep :: [Inline] -> [Block] -> [Inline] +blocksToInlinesWithSep sep blks = intercalate sep $ map blockToInlines blks + +blocksToInlines :: [Block] -> [Inline] +blocksToInlines = blocksToInlinesWithSep [Space, Str "¶", Space] + + +-- +-- Safe read +-- + +safeRead :: (MonadPlus m, Read a) => String -> m a +safeRead s = case reads s of + (d,x):_ + | all isSpace x -> return d + _ -> mzero + +-- +-- Temp directory +-- + +withTempDir :: String -> (FilePath -> IO a) -> IO a +withTempDir = +#ifdef _WINDOWS + withTempDirectory "." +#else + withSystemTempDirectory +#endif diff --git a/src/Text/Pandoc/Slides.hs b/src/Text/Pandoc/Slides.hs new file mode 100644 index 000000000..e19dba3e2 --- /dev/null +++ b/src/Text/Pandoc/Slides.hs @@ -0,0 +1,63 @@ +{- +Copyright (C) 2012-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Slides + Copyright : Copyright (C) 2012-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Utility functions for splitting documents into slides for slide +show formats (dzslides, revealjs, s5, slidy, slideous, beamer). +-} +module Text.Pandoc.Slides ( getSlideLevel, prepSlides ) where +import Text.Pandoc.Definition + +-- | Find level of header that starts slides (defined as the least header +-- level that occurs before a non-header/non-hrule in the blocks). +getSlideLevel :: [Block] -> Int +getSlideLevel = go 6 + where go least (Header n _ _ : x : xs) + | n < least && nonHOrHR x = go n xs + | otherwise = go least (x:xs) + go least (_ : xs) = go least xs + go least [] = least + nonHOrHR (Header{}) = False + nonHOrHR (HorizontalRule) = False + nonHOrHR _ = True + +-- | Prepare a block list to be passed to hierarchicalize. +prepSlides :: Int -> [Block] -> [Block] +prepSlides slideLevel = ensureStartWithH . splitHrule . extractRefsHeader + where splitHrule (HorizontalRule : Header n attr xs : ys) + | n == slideLevel = Header slideLevel attr xs : splitHrule ys + splitHrule (HorizontalRule : xs) = Header slideLevel nullAttr [Str "\0"] : + splitHrule xs + splitHrule (x : xs) = x : splitHrule xs + splitHrule [] = [] + extractRefsHeader bs = + case reverse bs of + (Div ("",["references"],[]) (Header n attrs xs : ys) : zs) + -> reverse zs ++ (Header n attrs xs : [Div ("",["references"],[]) ys]) + _ -> bs + ensureStartWithH bs@(Header n _ _:_) + | n <= slideLevel = bs + ensureStartWithH bs = Header slideLevel nullAttr [Str "\0"] : bs diff --git a/src/Text/Pandoc/Templates.hs b/src/Text/Pandoc/Templates.hs new file mode 100644 index 000000000..705ac54c9 --- /dev/null +++ b/src/Text/Pandoc/Templates.hs @@ -0,0 +1,77 @@ +{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, + OverloadedStrings, GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2009-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Templates + Copyright : Copyright (C) 2009-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +A simple templating system with variable substitution and conditionals. + +-} + +module Text.Pandoc.Templates ( renderTemplate + , renderTemplate' + , TemplateTarget + , varListToJSON + , compileTemplate + , Template + , getDefaultTemplate ) where + +import Text.DocTemplates (Template, TemplateTarget, compileTemplate, + renderTemplate, applyTemplate, + varListToJSON) +import Data.Aeson (ToJSON(..)) +import qualified Data.Text as T +import System.FilePath ((</>), (<.>)) +import qualified Control.Exception.Extensible as E (try, IOException) +import Text.Pandoc.Shared (readDataFileUTF8) + +-- | Get default template for the specified writer. +getDefaultTemplate :: (Maybe FilePath) -- ^ User data directory to search first + -> String -- ^ Name of writer + -> IO (Either E.IOException String) +getDefaultTemplate user writer = do + let format = takeWhile (`notElem` ("+-" :: String)) writer -- strip off extensions + case format of + "native" -> return $ Right "" + "json" -> return $ Right "" + "docx" -> return $ Right "" + "fb2" -> return $ Right "" + "odt" -> getDefaultTemplate user "opendocument" + "html" -> getDefaultTemplate user "html5" + "docbook" -> getDefaultTemplate user "docbook5" + "epub" -> getDefaultTemplate user "epub3" + "markdown_strict" -> getDefaultTemplate user "markdown" + "multimarkdown" -> getDefaultTemplate user "markdown" + "markdown_github" -> getDefaultTemplate user "markdown" + "markdown_mmd" -> getDefaultTemplate user "markdown" + "markdown_phpextra" -> getDefaultTemplate user "markdown" + _ -> let fname = "templates" </> "default" <.> format + in E.try $ readDataFileUTF8 user fname + +-- | Like 'applyTemplate', but raising an error if compilation fails. +renderTemplate' :: (ToJSON a, TemplateTarget b) => String -> a -> b +renderTemplate' template = either error id . applyTemplate (T.pack template) + diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs new file mode 100644 index 000000000..62a662029 --- /dev/null +++ b/src/Text/Pandoc/UTF8.hs @@ -0,0 +1,121 @@ +{- +Copyright (C) 2010-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.UTF8 + Copyright : Copyright (C) 2010-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +UTF-8 aware string IO functions that will work with GHC 6.10, 6.12, or 7. +-} +module Text.Pandoc.UTF8 ( readFile + , writeFile + , getContents + , putStr + , putStrLn + , hPutStr + , hPutStrLn + , hGetContents + , toString + , fromString + , toStringLazy + , fromStringLazy + , encodePath + , decodeArg + ) + +where + +import System.IO hiding (readFile, writeFile, getContents, + putStr, putStrLn, hPutStr, hPutStrLn, hGetContents) +import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn) +import qualified System.IO as IO +import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy as BL +import qualified Data.Text.Encoding as T +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.Text.Lazy.Encoding as TL + +readFile :: FilePath -> IO String +readFile f = do + h <- openFile (encodePath f) ReadMode + hGetContents h + +writeFile :: FilePath -> String -> IO () +writeFile f s = withFile (encodePath f) WriteMode $ \h -> hPutStr h s + +getContents :: IO String +getContents = hGetContents stdin + +putStr :: String -> IO () +putStr s = hPutStr stdout s + +putStrLn :: String -> IO () +putStrLn s = hPutStrLn stdout s + +hPutStr :: Handle -> String -> IO () +hPutStr h s = hSetEncoding h utf8 >> IO.hPutStr h s + +hPutStrLn :: Handle -> String -> IO () +hPutStrLn h s = hSetEncoding h utf8 >> IO.hPutStrLn h s + +hGetContents :: Handle -> IO String +hGetContents = fmap toString . B.hGetContents +-- hGetContents h = hSetEncoding h utf8_bom +-- >> hSetNewlineMode h universalNewlineMode +-- >> IO.hGetContents h + +-- | Drop BOM (byte order marker) if present at beginning of string. +-- Note that Data.Text converts the BOM to code point FEFF, zero-width +-- no-break space, so if the string begins with this we strip it off. +dropBOM :: String -> String +dropBOM ('\xFEFF':xs) = xs +dropBOM xs = xs + +filterCRs :: String -> String +filterCRs ('\r':'\n':xs) = '\n': filterCRs xs +filterCRs ('\r':xs) = '\n' : filterCRs xs +filterCRs (x:xs) = x : filterCRs xs +filterCRs [] = [] + +-- | Convert UTF8-encoded ByteString to String, also +-- removing '\r' characters. +toString :: B.ByteString -> String +toString = filterCRs . dropBOM . T.unpack . T.decodeUtf8 + +fromString :: String -> B.ByteString +fromString = T.encodeUtf8 . T.pack + +-- | Convert UTF8-encoded ByteString to String, also +-- removing '\r' characters. +toStringLazy :: BL.ByteString -> String +toStringLazy = filterCRs . dropBOM . TL.unpack . TL.decodeUtf8 + +fromStringLazy :: String -> BL.ByteString +fromStringLazy = TL.encodeUtf8 . TL.pack + +encodePath :: FilePath -> FilePath +encodePath = id + +decodeArg :: String -> String +decodeArg = id diff --git a/src/Text/Pandoc/UUID.hs b/src/Text/Pandoc/UUID.hs new file mode 100644 index 000000000..8de102742 --- /dev/null +++ b/src/Text/Pandoc/UUID.hs @@ -0,0 +1,78 @@ +{- +Copyright (C) 2010-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.UUID + Copyright : Copyright (C) 2010-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +UUID generation using Version 4 (random method) described +in RFC4122. See http://tools.ietf.org/html/rfc4122 +-} + +module Text.Pandoc.UUID ( UUID(..), getRandomUUID, getUUID ) where + +import Text.Printf ( printf ) +import System.Random ( RandomGen, randoms, getStdGen ) +import Data.Word +import Data.Bits ( setBit, clearBit ) + +data UUID = UUID Word8 Word8 Word8 Word8 Word8 Word8 Word8 Word8 + Word8 Word8 Word8 Word8 Word8 Word8 Word8 Word8 + +instance Show UUID where + show (UUID a b c d e f g h i j k l m n o p) = + "urn:uuid:" ++ + printf "%02x" a ++ + printf "%02x" b ++ + printf "%02x" c ++ + printf "%02x" d ++ + "-" ++ + printf "%02x" e ++ + printf "%02x" f ++ + "-" ++ + printf "%02x" g ++ + printf "%02x" h ++ + "-" ++ + printf "%02x" i ++ + printf "%02x" j ++ + "-" ++ + printf "%02x" k ++ + printf "%02x" l ++ + printf "%02x" m ++ + printf "%02x" n ++ + printf "%02x" o ++ + printf "%02x" p + +getUUID :: RandomGen g => g -> UUID +getUUID gen = + let [a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p] = take 16 $ randoms gen :: [Word8] + -- set variant + i' = i `setBit` 7 `clearBit` 6 + -- set version (0100 for random) + g' = g `clearBit` 7 `setBit` 6 `clearBit` 5 `clearBit` 4 + in + UUID a b c d e f g' h i' j k l m n o p + +getRandomUUID :: IO UUID +getRandomUUID = getUUID <$> getStdGen + diff --git a/src/Text/Pandoc/Writers/AsciiDoc.hs b/src/Text/Pandoc/Writers/AsciiDoc.hs new file mode 100644 index 000000000..356b29504 --- /dev/null +++ b/src/Text/Pandoc/Writers/AsciiDoc.hs @@ -0,0 +1,470 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.AsciiDoc + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to asciidoc. + +Note that some information may be lost in conversion, due to +expressive limitations of asciidoc. Footnotes and table cells with +paragraphs (or other block items) are not possible in asciidoc. +If pandoc encounters one of these, it will insert a message indicating +that it has omitted the construct. + +AsciiDoc: <http://www.methods.co.nz/asciidoc/> +-} +module Text.Pandoc.Writers.AsciiDoc (writeAsciiDoc) where +import Text.Pandoc.Definition +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (blankline, space) +import Data.Maybe (fromMaybe) +import Data.List ( stripPrefix, intersperse, intercalate ) +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import Control.Monad.State +import qualified Data.Map as M +import Data.Aeson (Value(String), fromJSON, toJSON, Result(..)) +import qualified Data.Text as T +import Data.Char (isSpace, isPunctuation) +import Text.Pandoc.Class (PandocMonad) + +data WriterState = WriterState { defListMarker :: String + , orderedListLevel :: Int + , bulletListLevel :: Int + , intraword :: Bool + } + +-- | Convert Pandoc to AsciiDoc. +writeAsciiDoc :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeAsciiDoc opts document = return $ + evalState (pandocToAsciiDoc opts document) WriterState{ + defListMarker = "::" + , orderedListLevel = 1 + , bulletListLevel = 1 + , intraword = False + } + +-- | Return asciidoc representation of document. +pandocToAsciiDoc :: WriterOptions -> Pandoc -> State WriterState String +pandocToAsciiDoc opts (Pandoc meta blocks) = do + let titleblock = not $ null (docTitle meta) && null (docAuthors meta) && + null (docDate meta) + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToAsciiDoc opts) + (fmap (render colwidth) . inlineListToAsciiDoc opts) + meta + let addTitleLine (String t) = String $ + t <> "\n" <> T.replicate (T.length t) "=" + addTitleLine x = x + let metadata' = case fromJSON metadata of + Success m -> toJSON $ M.adjust addTitleLine + ("title" :: T.Text) m + _ -> metadata + body <- blockListToAsciiDoc opts blocks + let main = render colwidth body + let context = defField "body" main + $ defField "toc" + (writerTableOfContents opts && + writerTemplate opts /= Nothing) + $ defField "titleblock" titleblock + $ metadata' + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Escape special characters for AsciiDoc. +escapeString :: String -> String +escapeString = escapeStringUsing escs + where escs = backslashEscapes "{" + +-- | Ordered list start parser for use in Para below. +olMarker :: Parser [Char] ParserState Char +olMarker = do (start, style', delim) <- anyOrderedListMarker + if delim == Period && + (style' == UpperAlpha || (style' == UpperRoman && + start `elem` [1, 5, 10, 50, 100, 500, 1000])) + then spaceChar >> spaceChar + else spaceChar + +-- | True if string begins with an ordered list marker +beginsWithOrderedListMarker :: String -> Bool +beginsWithOrderedListMarker str = + case runParser olMarker defaultParserState "para start" (take 10 str) of + Left _ -> False + Right _ -> True + +-- | Convert Pandoc block element to asciidoc. +blockToAsciiDoc :: WriterOptions -- ^ Options + -> Block -- ^ Block element + -> State WriterState Doc +blockToAsciiDoc _ Null = return empty +blockToAsciiDoc opts (Plain inlines) = do + contents <- inlineListToAsciiDoc opts inlines + return $ contents <> blankline +blockToAsciiDoc opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = do + blockToAsciiDoc opts (Para [Image attr alt (src,tit)]) +blockToAsciiDoc opts (Para inlines) = do + contents <- inlineListToAsciiDoc opts inlines + -- escape if para starts with ordered list marker + let esc = if beginsWithOrderedListMarker (render Nothing contents) + then text "\\" + else empty + return $ esc <> contents <> blankline +blockToAsciiDoc opts (LineBlock lns) = do + let docify line = if null line + then return blankline + else inlineListToAsciiDoc opts line + let joinWithLinefeeds = nowrap . mconcat . intersperse cr + contents <- joinWithLinefeeds <$> mapM docify lns + return $ "[verse]" $$ text "--" $$ contents $$ text "--" $$ blankline +blockToAsciiDoc _ (RawBlock f s) + | f == "asciidoc" = return $ text s + | otherwise = return empty +blockToAsciiDoc _ HorizontalRule = + return $ blankline <> text "'''''" <> blankline +blockToAsciiDoc opts (Header level (ident,_,_) inlines) = do + contents <- inlineListToAsciiDoc opts inlines + let len = offset contents + -- ident seem to be empty most of the time and asciidoc will generate them automatically + -- so lets make them not show up when null + let identifier = if (null ident) then empty else ("[[" <> text ident <> "]]") + let setext = writerSetextHeaders opts + return $ + (if setext + then + identifier $$ contents $$ + (case level of + 1 -> text $ replicate len '-' + 2 -> text $ replicate len '~' + 3 -> text $ replicate len '^' + 4 -> text $ replicate len '+' + _ -> empty) <> blankline + else + identifier $$ text (replicate level '=') <> space <> contents <> blankline) +blockToAsciiDoc _ (CodeBlock (_,classes,_) str) = return $ (flush $ + if null classes + then "...." $$ text str $$ "...." + else attrs $$ "----" $$ text str $$ "----") + <> blankline + where attrs = "[" <> text (intercalate "," ("source" : classes)) <> "]" +blockToAsciiDoc opts (BlockQuote blocks) = do + contents <- blockListToAsciiDoc opts blocks + let isBlock (BlockQuote _) = True + isBlock _ = False + -- if there are nested block quotes, put in an open block + let contents' = if any isBlock blocks + then "--" $$ contents $$ "--" + else contents + let cols = offset contents' + let bar = text $ replicate cols '_' + return $ bar $$ chomp contents' $$ bar <> blankline +blockToAsciiDoc opts (Table caption aligns widths headers rows) = do + caption' <- inlineListToAsciiDoc opts caption + let caption'' = if null caption + then empty + else "." <> caption' <> cr + let isSimple = all (== 0) widths + let relativePercentWidths = if isSimple + then widths + else map (/ (sum widths)) widths + let widths'' :: [Integer] + widths'' = map (floor . (* 100)) relativePercentWidths + -- ensure that the widths sum to 100 + let widths' = case widths'' of + _ | isSimple -> widths'' + (w:ws) | sum (w:ws) < 100 + -> (100 - sum ws) : ws + ws -> ws + let totalwidth :: Integer + totalwidth = floor $ sum widths * 100 + let colspec al wi = (case al of + AlignLeft -> "<" + AlignCenter -> "^" + AlignRight -> ">" + AlignDefault -> "") ++ + if wi == 0 then "" else (show wi ++ "%") + let headerspec = if all null headers + then empty + else text "options=\"header\"," + let widthspec = if totalwidth == 0 + then empty + else text "width=" + <> doubleQuotes (text $ show totalwidth ++ "%") + <> text "," + let tablespec = text "[" + <> widthspec + <> text "cols=" + <> doubleQuotes (text $ intercalate "," + $ zipWith colspec aligns widths') + <> text "," + <> headerspec <> text "]" + let makeCell [Plain x] = do d <- blockListToAsciiDoc opts [Plain x] + return $ text "|" <> chomp d + makeCell [Para x] = makeCell [Plain x] + makeCell [] = return $ text "|" + makeCell bs = do d <- blockListToAsciiDoc opts bs + return $ text "a|" $$ d + let makeRow cells = hsep `fmap` mapM makeCell cells + rows' <- mapM makeRow rows + head' <- makeRow headers + let head'' = if all null headers then empty else head' + let colwidth = if writerWrapText opts == WrapAuto + then writerColumns opts + else 100000 + let maxwidth = maximum $ map offset (head':rows') + let body = if maxwidth > colwidth then vsep rows' else vcat rows' + let border = text $ "|" ++ replicate (max 5 (min maxwidth colwidth) - 1) '=' + return $ + caption'' $$ tablespec $$ border $$ head'' $$ body $$ border $$ blankline +blockToAsciiDoc opts (BulletList items) = do + contents <- mapM (bulletListItemToAsciiDoc opts) items + return $ cat contents <> blankline +blockToAsciiDoc opts (OrderedList (_start, sty, _delim) items) = do + let sty' = case sty of + UpperRoman -> UpperAlpha + LowerRoman -> LowerAlpha + x -> x + let markers = orderedListMarkers (1, sty', Period) -- start num not used + let markers' = map (\m -> if length m < 3 + then m ++ replicate (3 - length m) ' ' + else m) markers + contents <- mapM (\(item, num) -> orderedListItemToAsciiDoc opts item num) $ + zip markers' items + return $ cat contents <> blankline +blockToAsciiDoc opts (DefinitionList items) = do + contents <- mapM (definitionListItemToAsciiDoc opts) items + return $ cat contents <> blankline +blockToAsciiDoc opts (Div (ident,_,_) bs) = do + let identifier = if (null ident) then empty else ("[[" <> text ident <> "]]") + contents <- blockListToAsciiDoc opts bs + return $ identifier $$ contents + +-- | Convert bullet list item (list of blocks) to asciidoc. +bulletListItemToAsciiDoc :: WriterOptions -> [Block] -> State WriterState Doc +bulletListItemToAsciiDoc opts blocks = do + let addBlock :: Doc -> Block -> State WriterState Doc + addBlock d b | isEmpty d = chomp `fmap` blockToAsciiDoc opts b + addBlock d b@(BulletList _) = do x <- blockToAsciiDoc opts b + return $ d <> cr <> chomp x + addBlock d b@(OrderedList _ _) = do x <- blockToAsciiDoc opts b + return $ d <> cr <> chomp x + addBlock d b = do x <- blockToAsciiDoc opts b + return $ d <> cr <> text "+" <> cr <> chomp x + lev <- bulletListLevel `fmap` get + modify $ \s -> s{ bulletListLevel = lev + 1 } + contents <- foldM addBlock empty blocks + modify $ \s -> s{ bulletListLevel = lev } + let marker = text (replicate lev '*') + return $ marker <> text " " <> contents <> cr + +-- | Convert ordered list item (a list of blocks) to asciidoc. +orderedListItemToAsciiDoc :: WriterOptions -- ^ options + -> String -- ^ list item marker + -> [Block] -- ^ list item (list of blocks) + -> State WriterState Doc +orderedListItemToAsciiDoc opts marker blocks = do + let addBlock :: Doc -> Block -> State WriterState Doc + addBlock d b | isEmpty d = chomp `fmap` blockToAsciiDoc opts b + addBlock d b@(BulletList _) = do x <- blockToAsciiDoc opts b + return $ d <> cr <> chomp x + addBlock d b@(OrderedList _ _) = do x <- blockToAsciiDoc opts b + return $ d <> cr <> chomp x + addBlock d b = do x <- blockToAsciiDoc opts b + return $ d <> cr <> text "+" <> cr <> chomp x + lev <- orderedListLevel `fmap` get + modify $ \s -> s{ orderedListLevel = lev + 1 } + contents <- foldM addBlock empty blocks + modify $ \s -> s{ orderedListLevel = lev } + return $ text marker <> text " " <> contents <> cr + +-- | Convert definition list item (label, list of blocks) to asciidoc. +definitionListItemToAsciiDoc :: WriterOptions + -> ([Inline],[[Block]]) + -> State WriterState Doc +definitionListItemToAsciiDoc opts (label, defs) = do + labelText <- inlineListToAsciiDoc opts label + marker <- defListMarker `fmap` get + if marker == "::" + then modify (\st -> st{ defListMarker = ";;"}) + else modify (\st -> st{ defListMarker = "::"}) + let divider = cr <> text "+" <> cr + let defsToAsciiDoc :: [Block] -> State WriterState Doc + defsToAsciiDoc ds = (vcat . intersperse divider . map chomp) + `fmap` mapM (blockToAsciiDoc opts) ds + defs' <- mapM defsToAsciiDoc defs + modify (\st -> st{ defListMarker = marker }) + let contents = nest 2 $ vcat $ intersperse divider $ map chomp defs' + return $ labelText <> text marker <> cr <> contents <> cr + +-- | Convert list of Pandoc block elements to asciidoc. +blockListToAsciiDoc :: WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> State WriterState Doc +blockListToAsciiDoc opts blocks = cat `fmap` mapM (blockToAsciiDoc opts) blocks + +data SpacyLocation = End | Start + +-- | Convert list of Pandoc inline elements to asciidoc. +inlineListToAsciiDoc :: WriterOptions -> [Inline] -> State WriterState Doc +inlineListToAsciiDoc opts lst = do + oldIntraword <- gets intraword + setIntraword False + result <- go lst + setIntraword oldIntraword + return result + where go [] = return empty + go (y:x:xs) + | not (isSpacy End y) = do + y' <- if isSpacy Start x + then inlineToAsciiDoc opts y + else withIntraword $ inlineToAsciiDoc opts y + x' <- withIntraword $ inlineToAsciiDoc opts x + xs' <- go xs + return (y' <> x' <> xs') + | not (isSpacy Start x) = do + y' <- withIntraword $ inlineToAsciiDoc opts y + xs' <- go (x:xs) + return (y' <> xs') + go (x:xs) = do + x' <- inlineToAsciiDoc opts x + xs' <- go xs + return (x' <> xs') + isSpacy :: SpacyLocation -> Inline -> Bool + isSpacy _ Space = True + isSpacy _ LineBreak = True + isSpacy _ SoftBreak = True + -- Note that \W characters count as spacy in AsciiDoc + -- for purposes of determining interword: + isSpacy End (Str xs) = case reverse xs of + c:_ -> isPunctuation c || isSpace c + _ -> False + isSpacy Start (Str (c:_)) = isPunctuation c || isSpace c + isSpacy _ _ = False + +setIntraword :: Bool -> State WriterState () +setIntraword b = modify $ \st -> st{ intraword = b } + +withIntraword :: State WriterState a -> State WriterState a +withIntraword p = setIntraword True *> p <* setIntraword False + +-- | Convert Pandoc inline element to asciidoc. +inlineToAsciiDoc :: WriterOptions -> Inline -> State WriterState Doc +inlineToAsciiDoc opts (Emph lst) = do + contents <- inlineListToAsciiDoc opts lst + isIntraword <- gets intraword + let marker = if isIntraword then "__" else "_" + return $ marker <> contents <> marker +inlineToAsciiDoc opts (Strong lst) = do + contents <- inlineListToAsciiDoc opts lst + isIntraword <- gets intraword + let marker = if isIntraword then "**" else "*" + return $ marker <> contents <> marker +inlineToAsciiDoc opts (Strikeout lst) = do + contents <- inlineListToAsciiDoc opts lst + return $ "[line-through]*" <> contents <> "*" +inlineToAsciiDoc opts (Superscript lst) = do + contents <- inlineListToAsciiDoc opts lst + return $ "^" <> contents <> "^" +inlineToAsciiDoc opts (Subscript lst) = do + contents <- inlineListToAsciiDoc opts lst + return $ "~" <> contents <> "~" +inlineToAsciiDoc opts (SmallCaps lst) = inlineListToAsciiDoc opts lst +inlineToAsciiDoc opts (Quoted SingleQuote lst) = + inlineListToAsciiDoc opts (Str "`" : lst ++ [Str "'"]) +inlineToAsciiDoc opts (Quoted DoubleQuote lst) = + inlineListToAsciiDoc opts (Str "``" : lst ++ [Str "''"]) +inlineToAsciiDoc _ (Code _ str) = return $ + text "`" <> text (escapeStringUsing (backslashEscapes "`") str) <> "`" +inlineToAsciiDoc _ (Str str) = return $ text $ escapeString str +inlineToAsciiDoc _ (Math InlineMath str) = + return $ "latexmath:[$" <> text str <> "$]" +inlineToAsciiDoc _ (Math DisplayMath str) = + return $ "latexmath:[\\[" <> text str <> "\\]]" +inlineToAsciiDoc _ (RawInline f s) + | f == "asciidoc" = return $ text s + | otherwise = return empty +inlineToAsciiDoc _ LineBreak = return $ " +" <> cr +inlineToAsciiDoc _ Space = return space +inlineToAsciiDoc opts SoftBreak = + case writerWrapText opts of + WrapAuto -> return space + WrapPreserve -> return cr + WrapNone -> return space +inlineToAsciiDoc opts (Cite _ lst) = inlineListToAsciiDoc opts lst +inlineToAsciiDoc opts (Link _ txt (src, _tit)) = do +-- relative: link:downloads/foo.zip[download foo.zip] +-- abs: http://google.cod[Google] +-- or my@email.com[email john] + linktext <- inlineListToAsciiDoc opts txt + let isRelative = ':' `notElem` src + let prefix = if isRelative + then text "link:" + else empty + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) + let useAuto = case txt of + [Str s] | escapeURI s == srcSuffix -> True + _ -> False + return $ if useAuto + then text srcSuffix + else prefix <> text src <> "[" <> linktext <> "]" +inlineToAsciiDoc opts (Image attr alternate (src, tit)) = do +-- image:images/logo.png[Company logo, title="blah"] + let txt = if (null alternate) || (alternate == [Str ""]) + then [Str "image"] + else alternate + linktext <- inlineListToAsciiDoc opts txt + let linktitle = if null tit + then empty + else ",title=\"" <> text tit <> "\"" + showDim dir = case (dimension dir attr) of + Just (Percent a) -> + ["scaledwidth=" <> text (show (Percent a))] + Just dim -> + [text (show dir) <> "=" <> text (showInPixel opts dim)] + Nothing -> + [] + dimList = showDim Width ++ showDim Height + dims = if null dimList + then empty + else "," <> cat (intersperse "," dimList) + return $ "image:" <> text src <> "[" <> linktext <> linktitle <> dims <> "]" +inlineToAsciiDoc opts (Note [Para inlines]) = + inlineToAsciiDoc opts (Note [Plain inlines]) +inlineToAsciiDoc opts (Note [Plain inlines]) = do + contents <- inlineListToAsciiDoc opts inlines + return $ text "footnote:[" <> contents <> "]" +-- asciidoc can't handle blank lines in notes +inlineToAsciiDoc _ (Note _) = return "[multiblock footnote omitted]" +inlineToAsciiDoc opts (Span (ident,_,_) ils) = do + let identifier = if (null ident) then empty else ("[[" <> text ident <> "]]") + contents <- inlineListToAsciiDoc opts ils + return $ identifier <> contents diff --git a/src/Text/Pandoc/Writers/CommonMark.hs b/src/Text/Pandoc/Writers/CommonMark.hs new file mode 100644 index 000000000..b83f6785d --- /dev/null +++ b/src/Text/Pandoc/Writers/CommonMark.hs @@ -0,0 +1,190 @@ +{- +Copyright (C) 2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.CommonMark + Copyright : Copyright (C) 2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to CommonMark. + +CommonMark: <http://commonmark.org> +-} +module Text.Pandoc.Writers.CommonMark (writeCommonMark) where + +import Text.Pandoc.Writers.HTML (writeHtml5String) +import Text.Pandoc.Definition +import Text.Pandoc.Shared (isTightList, linesToPara) +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import CMark +import qualified Data.Text as T +import Control.Monad.State (runState, State, modify, get) +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Class (PandocMonad) +import Data.Foldable (foldrM) + +-- | Convert Pandoc to CommonMark. +writeCommonMark :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeCommonMark opts (Pandoc meta blocks) = do + let (blocks', notes) = runState (walkM processNotes blocks) [] + notes' = if null notes + then [] + else [OrderedList (1, Decimal, Period) $ reverse notes] + main <- blocksToCommonMark opts (blocks' ++ notes') + metadata <- metaToJSON opts + (blocksToCommonMark opts) + (inlinesToCommonMark opts) + meta + let context = defField "body" main $ metadata + return $ case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + +processNotes :: Inline -> State [[Block]] Inline +processNotes (Note bs) = do + modify (bs :) + notes <- get + return $ Str $ "[" ++ show (length notes) ++ "]" +processNotes x = return x + +node :: NodeType -> [Node] -> Node +node = Node Nothing + +blocksToCommonMark :: PandocMonad m => WriterOptions -> [Block] -> m String +blocksToCommonMark opts bs = do + let cmarkOpts = [optHardBreaks | isEnabled Ext_hard_line_breaks opts] + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + nodes <- blocksToNodes bs + return $ + T.unpack $ + nodeToCommonmark cmarkOpts colwidth $ + node DOCUMENT nodes + +inlinesToCommonMark :: PandocMonad m => WriterOptions -> [Inline] -> m String +inlinesToCommonMark opts ils = return $ + T.unpack $ nodeToCommonmark cmarkOpts colwidth + $ node PARAGRAPH (inlinesToNodes ils) + where cmarkOpts = [optHardBreaks | isEnabled Ext_hard_line_breaks opts] + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + +blocksToNodes :: PandocMonad m => [Block] -> m [Node] +blocksToNodes = foldrM blockToNodes [] + +blockToNodes :: PandocMonad m => Block -> [Node] -> m [Node] +blockToNodes (Plain xs) ns = return (node PARAGRAPH (inlinesToNodes xs) : ns) +blockToNodes (Para xs) ns = return (node PARAGRAPH (inlinesToNodes xs) : ns) +blockToNodes (LineBlock lns) ns = blockToNodes (linesToPara lns) ns +blockToNodes (CodeBlock (_,classes,_) xs) ns = return $ + (node (CODE_BLOCK (T.pack (unwords classes)) (T.pack xs)) [] : ns) +blockToNodes (RawBlock fmt xs) ns + | fmt == Format "html" = return (node (HTML_BLOCK (T.pack xs)) [] : ns) + | otherwise = return (node (CUSTOM_BLOCK (T.pack xs) (T.empty)) [] : ns) +blockToNodes (BlockQuote bs) ns = do + nodes <- blocksToNodes bs + return (node BLOCK_QUOTE nodes : ns) +blockToNodes (BulletList items) ns = do + nodes <- mapM blocksToNodes items + return (node (LIST ListAttributes{ + listType = BULLET_LIST, + listDelim = PERIOD_DELIM, + listTight = isTightList items, + listStart = 1 }) (map (node ITEM) nodes) : ns) +blockToNodes (OrderedList (start, _sty, delim) items) ns = do + nodes <- mapM blocksToNodes items + return (node (LIST ListAttributes{ + listType = ORDERED_LIST, + listDelim = case delim of + OneParen -> PAREN_DELIM + TwoParens -> PAREN_DELIM + _ -> PERIOD_DELIM, + listTight = isTightList items, + listStart = start }) (map (node ITEM) nodes) : ns) +blockToNodes HorizontalRule ns = return (node THEMATIC_BREAK [] : ns) +blockToNodes (Header lev _ ils) ns = return (node (HEADING lev) (inlinesToNodes ils) : ns) +blockToNodes (Div _ bs) ns = do + nodes <- blocksToNodes bs + return (nodes ++ ns) +blockToNodes (DefinitionList items) ns = blockToNodes (BulletList items') ns + where items' = map dlToBullet items + dlToBullet (term, ((Para xs : ys) : zs)) = + Para (term ++ [LineBreak] ++ xs) : ys ++ concat zs + dlToBullet (term, ((Plain xs : ys) : zs)) = + Plain (term ++ [LineBreak] ++ xs) : ys ++ concat zs + dlToBullet (term, xs) = + Para term : concat xs +blockToNodes t@(Table _ _ _ _ _) ns = do + s <- writeHtml5String def $! Pandoc nullMeta [t] + return (node (HTML_BLOCK (T.pack $! s)) [] : ns) +blockToNodes Null ns = return ns + +inlinesToNodes :: [Inline] -> [Node] +inlinesToNodes = foldr inlineToNodes [] + +inlineToNodes :: Inline -> [Node] -> [Node] +inlineToNodes (Str s) = (node (TEXT (T.pack s)) [] :) +inlineToNodes Space = (node (TEXT (T.pack " ")) [] :) +inlineToNodes LineBreak = (node LINEBREAK [] :) +inlineToNodes SoftBreak = (node SOFTBREAK [] :) +inlineToNodes (Emph xs) = (node EMPH (inlinesToNodes xs) :) +inlineToNodes (Strong xs) = (node STRONG (inlinesToNodes xs) :) +inlineToNodes (Strikeout xs) = + ((node (HTML_INLINE (T.pack "<s>")) [] : inlinesToNodes xs ++ + [node (HTML_INLINE (T.pack "</s>")) []]) ++ ) +inlineToNodes (Superscript xs) = + ((node (HTML_INLINE (T.pack "<sup>")) [] : inlinesToNodes xs ++ + [node (HTML_INLINE (T.pack "</sup>")) []]) ++ ) +inlineToNodes (Subscript xs) = + ((node (HTML_INLINE (T.pack "<sub>")) [] : inlinesToNodes xs ++ + [node (HTML_INLINE (T.pack "</sub>")) []]) ++ ) +inlineToNodes (SmallCaps xs) = + ((node (HTML_INLINE (T.pack "<span style=\"font-variant:small-caps;\">")) [] + : inlinesToNodes xs ++ + [node (HTML_INLINE (T.pack "</span>")) []]) ++ ) +inlineToNodes (Link _ ils (url,tit)) = + (node (LINK (T.pack url) (T.pack tit)) (inlinesToNodes ils) :) +inlineToNodes (Image _ ils (url,tit)) = + (node (IMAGE (T.pack url) (T.pack tit)) (inlinesToNodes ils) :) +inlineToNodes (RawInline fmt xs) + | fmt == Format "html" = (node (HTML_INLINE (T.pack xs)) [] :) + | otherwise = (node (CUSTOM_INLINE (T.pack xs) (T.empty)) [] :) +inlineToNodes (Quoted qt ils) = + ((node (TEXT start) [] : inlinesToNodes ils ++ [node (TEXT end) []]) ++) + where (start, end) = case qt of + SingleQuote -> (T.pack "‘", T.pack "’") + DoubleQuote -> (T.pack "“", T.pack "”") +inlineToNodes (Code _ str) = (node (CODE (T.pack str)) [] :) +inlineToNodes (Math mt str) = + case mt of + InlineMath -> + (node (HTML_INLINE (T.pack ("\\(" ++ str ++ "\\)"))) [] :) + DisplayMath -> + (node (HTML_INLINE (T.pack ("\\[" ++ str ++ "\\]"))) [] :) +inlineToNodes (Span _ ils) = (inlinesToNodes ils ++) +inlineToNodes (Cite _ ils) = (inlinesToNodes ils ++) +inlineToNodes (Note _) = id -- should not occur +-- we remove Note elements in preprocessing diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs new file mode 100644 index 000000000..ea8b90db3 --- /dev/null +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -0,0 +1,481 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2007-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.ConTeXt + Copyright : Copyright (C) 2007-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' format into ConTeXt. +-} +module Text.Pandoc.Writers.ConTeXt ( writeConTeXt ) where +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Walk (query) +import Text.Printf ( printf ) +import Data.List ( intercalate, intersperse ) +import Data.Char ( ord ) +import Data.Maybe ( catMaybes ) +import Control.Monad.State +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates ( renderTemplate' ) +import Network.URI ( isURI, unEscapeString ) +import Text.Pandoc.Class (PandocMonad) + +data WriterState = + WriterState { stNextRef :: Int -- number of next URL reference + , stOrderedListLevel :: Int -- level of ordered list + , stOptions :: WriterOptions -- writer options + } + +orderedListStyles :: [Char] +orderedListStyles = cycle "narg" + +-- | Convert Pandoc to ConTeXt. +writeConTeXt :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeConTeXt options document = return $ + let defaultWriterState = WriterState { stNextRef = 1 + , stOrderedListLevel = 0 + , stOptions = options + } + in evalState (pandocToConTeXt options document) defaultWriterState + +pandocToConTeXt :: WriterOptions -> Pandoc -> State WriterState String +pandocToConTeXt options (Pandoc meta blocks) = do + let colwidth = if writerWrapText options == WrapAuto + then Just $ writerColumns options + else Nothing + metadata <- metaToJSON options + (fmap (render colwidth) . blockListToConTeXt) + (fmap (render colwidth) . inlineListToConTeXt) + meta + body <- mapM (elementToConTeXt options) $ hierarchicalize blocks + let main = (render colwidth . vcat) body + let layoutFromMargins = intercalate [','] $ catMaybes $ + map (\(x,y) -> + ((x ++ "=") ++) <$> getField y metadata) + [("leftmargin","margin-left") + ,("rightmargin","margin-right") + ,("top","margin-top") + ,("bottom","margin-bottom") + ] + let context = defField "toc" (writerTableOfContents options) + $ defField "placelist" (intercalate ("," :: String) $ + take (writerTOCDepth options + + case writerTopLevelDivision options of + TopLevelPart -> 0 + TopLevelChapter -> 0 + _ -> 1) + ["chapter","section","subsection","subsubsection", + "subsubsubsection","subsubsubsubsection"]) + $ defField "body" main + $ defField "layout" layoutFromMargins + $ defField "number-sections" (writerNumberSections options) + $ metadata + let context' = defField "context-lang" (maybe "" (fromBcp47 . splitBy (=='-')) $ + getField "lang" context) + $ defField "context-dir" (toContextDir $ getField "dir" context) + $ context + return $ case writerTemplate options of + Nothing -> main + Just tpl -> renderTemplate' tpl context' + +toContextDir :: Maybe String -> String +toContextDir (Just "rtl") = "r2l" +toContextDir (Just "ltr") = "l2r" +toContextDir _ = "" + +-- | escape things as needed for ConTeXt +escapeCharForConTeXt :: WriterOptions -> Char -> String +escapeCharForConTeXt opts ch = + let ligatures = isEnabled Ext_smart opts in + case ch of + '{' -> "\\{" + '}' -> "\\}" + '\\' -> "\\letterbackslash{}" + '$' -> "\\$" + '|' -> "\\letterbar{}" + '%' -> "\\letterpercent{}" + '~' -> "\\lettertilde{}" + '#' -> "\\#" + '[' -> "{[}" + ']' -> "{]}" + '\160' -> "~" + '\x2014' | ligatures -> "---" + '\x2013' | ligatures -> "--" + '\x2019' | ligatures -> "'" + '\x2026' -> "\\ldots{}" + x -> [x] + +-- | Escape string for ConTeXt +stringToConTeXt :: WriterOptions -> String -> String +stringToConTeXt opts = concatMap (escapeCharForConTeXt opts) + +-- | Sanitize labels +toLabel :: String -> String +toLabel z = concatMap go z + where go x + | elem x ("\\#[]\",{}%()|=" :: String) = "ux" ++ printf "%x" (ord x) + | otherwise = [x] + +-- | Convert Elements to ConTeXt +elementToConTeXt :: WriterOptions -> Element -> State WriterState Doc +elementToConTeXt _ (Blk block) = blockToConTeXt block +elementToConTeXt opts (Sec level _ attr title' elements) = do + header' <- sectionHeader attr level title' + innerContents <- mapM (elementToConTeXt opts) elements + return $ vcat (header' : innerContents) + +-- | Convert Pandoc block element to ConTeXt. +blockToConTeXt :: Block + -> State WriterState Doc +blockToConTeXt Null = return empty +blockToConTeXt (Plain lst) = inlineListToConTeXt lst +-- title beginning with fig: indicates that the image is a figure +blockToConTeXt (Para [Image attr txt (src,'f':'i':'g':':':_)]) = do + capt <- inlineListToConTeXt txt + img <- inlineToConTeXt (Image attr txt (src, "")) + let (ident, _, _) = attr + label = if null ident + then empty + else "[]" <> brackets (text $ toLabel ident) + return $ blankline $$ "\\placefigure" <> label <> braces capt <> img <> blankline +blockToConTeXt (Para lst) = do + contents <- inlineListToConTeXt lst + return $ contents <> blankline +blockToConTeXt (LineBlock lns) = do + doclines <- nowrap . vcat <$> mapM inlineListToConTeXt lns + return $ "\\startlines" $$ doclines $$ "\\stoplines" <> blankline +blockToConTeXt (BlockQuote lst) = do + contents <- blockListToConTeXt lst + return $ "\\startblockquote" $$ nest 0 contents $$ "\\stopblockquote" <> blankline +blockToConTeXt (CodeBlock _ str) = + return $ flush ("\\starttyping" <> cr <> text str <> cr <> "\\stoptyping") $$ blankline + -- blankline because \stoptyping can't have anything after it, inc. '}' +blockToConTeXt (RawBlock "context" str) = return $ text str <> blankline +blockToConTeXt (RawBlock _ _ ) = return empty +blockToConTeXt (Div (ident,_,kvs) bs) = do + let align dir txt = "\\startalignment[" <> dir <> "]" $$ txt $$ "\\stopalignment" + let wrapRef txt = if null ident + then txt + else ("\\reference" <> brackets (text $ toLabel ident) <> + braces empty <> "%") $$ txt + wrapDir = case lookup "dir" kvs of + Just "rtl" -> align "righttoleft" + Just "ltr" -> align "lefttoright" + _ -> id + wrapLang txt = case lookup "lang" kvs of + Just lng -> "\\start\\language[" + <> text (fromBcp47' lng) <> "]" $$ txt $$ "\\stop" + Nothing -> txt + wrapBlank txt = blankline <> txt <> blankline + fmap (wrapBlank . wrapLang . wrapDir . wrapRef) $ blockListToConTeXt bs +blockToConTeXt (BulletList lst) = do + contents <- mapM listItemToConTeXt lst + return $ ("\\startitemize" <> if isTightList lst + then brackets "packed" + else empty) $$ + vcat contents $$ text "\\stopitemize" <> blankline +blockToConTeXt (OrderedList (start, style', delim) lst) = do + st <- get + let level = stOrderedListLevel st + put $ st {stOrderedListLevel = level + 1} + contents <- mapM listItemToConTeXt lst + put $ st {stOrderedListLevel = level} + let start' = if start == 1 then "" else "start=" ++ show start + let delim' = case delim of + DefaultDelim -> "" + Period -> "stopper=." + OneParen -> "stopper=)" + TwoParens -> "left=(,stopper=)" + let width = maximum $ map length $ take (length contents) + (orderedListMarkers (start, style', delim)) + let width' = (toEnum width + 1) / 2 + let width'' = if width' > (1.5 :: Double) + then "width=" ++ show width' ++ "em" + else "" + let specs2Items = filter (not . null) [start', delim', width''] + let specs2 = if null specs2Items + then "" + else "[" ++ intercalate "," specs2Items ++ "]" + let style'' = '[': (case style' of + DefaultStyle -> orderedListStyles !! level + Decimal -> 'n' + Example -> 'n' + LowerRoman -> 'r' + UpperRoman -> 'R' + LowerAlpha -> 'a' + UpperAlpha -> 'A') : + if isTightList lst then ",packed]" else "]" + let specs = style'' ++ specs2 + return $ "\\startitemize" <> text specs $$ vcat contents $$ + "\\stopitemize" <> blankline +blockToConTeXt (DefinitionList lst) = + liftM vcat $ mapM defListItemToConTeXt lst +blockToConTeXt HorizontalRule = return $ "\\thinrule" <> blankline +-- If this is ever executed, provide a default for the reference identifier. +blockToConTeXt (Header level attr lst) = sectionHeader attr level lst +blockToConTeXt (Table caption aligns widths heads rows) = do + let colDescriptor colWidth alignment = (case alignment of + AlignLeft -> 'l' + AlignRight -> 'r' + AlignCenter -> 'c' + AlignDefault -> 'l'): + if colWidth == 0 + then "|" + else ("p(" ++ printf "%.2f" colWidth ++ "\\textwidth)|") + let colDescriptors = "|" ++ (concat $ + zipWith colDescriptor widths aligns) + headers <- if all null heads + then return empty + else liftM ($$ "\\HL") $ tableRowToConTeXt heads + captionText <- inlineListToConTeXt caption + rows' <- mapM tableRowToConTeXt rows + return $ "\\placetable" <> (if null caption + then brackets "none" + else empty) + <> braces captionText $$ + "\\starttable" <> brackets (text colDescriptors) $$ + "\\HL" $$ headers $$ + vcat rows' $$ "\\HL" $$ "\\stoptable" <> blankline + +tableRowToConTeXt :: [[Block]] -> State WriterState Doc +tableRowToConTeXt cols = do + cols' <- mapM blockListToConTeXt cols + return $ (vcat (map ("\\NC " <>) cols')) $$ "\\NC\\AR" + +listItemToConTeXt :: [Block] -> State WriterState Doc +listItemToConTeXt list = blockListToConTeXt list >>= + return . ("\\item" $$) . (nest 2) + +defListItemToConTeXt :: ([Inline], [[Block]]) -> State WriterState Doc +defListItemToConTeXt (term, defs) = do + term' <- inlineListToConTeXt term + def' <- liftM vsep $ mapM blockListToConTeXt defs + return $ "\\startdescription" <> braces term' $$ nest 2 def' $$ + "\\stopdescription" <> blankline + +-- | Convert list of block elements to ConTeXt. +blockListToConTeXt :: [Block] -> State WriterState Doc +blockListToConTeXt lst = liftM vcat $ mapM blockToConTeXt lst + +-- | Convert list of inline elements to ConTeXt. +inlineListToConTeXt :: [Inline] -- ^ Inlines to convert + -> State WriterState Doc +inlineListToConTeXt lst = liftM hcat $ mapM inlineToConTeXt $ addStruts lst + -- We add a \strut after a line break that precedes a space, + -- or the space gets swallowed + where addStruts (LineBreak : s : xs) | isSpacey s = + LineBreak : RawInline (Format "context") "\\strut " : s : + addStruts xs + addStruts (x:xs) = x : addStruts xs + addStruts [] = [] + isSpacey Space = True + isSpacey (Str ('\160':_)) = True + isSpacey _ = False + +-- | Convert inline element to ConTeXt +inlineToConTeXt :: Inline -- ^ Inline to convert + -> State WriterState Doc +inlineToConTeXt (Emph lst) = do + contents <- inlineListToConTeXt lst + return $ braces $ "\\em " <> contents +inlineToConTeXt (Strong lst) = do + contents <- inlineListToConTeXt lst + return $ braces $ "\\bf " <> contents +inlineToConTeXt (Strikeout lst) = do + contents <- inlineListToConTeXt lst + return $ "\\overstrikes" <> braces contents +inlineToConTeXt (Superscript lst) = do + contents <- inlineListToConTeXt lst + return $ "\\high" <> braces contents +inlineToConTeXt (Subscript lst) = do + contents <- inlineListToConTeXt lst + return $ "\\low" <> braces contents +inlineToConTeXt (SmallCaps lst) = do + contents <- inlineListToConTeXt lst + return $ braces $ "\\sc " <> contents +inlineToConTeXt (Code _ str) | not ('{' `elem` str || '}' `elem` str) = + return $ "\\type" <> braces (text str) +inlineToConTeXt (Code _ str) = do + opts <- gets stOptions + return $ "\\mono" <> braces (text $ stringToConTeXt opts str) +inlineToConTeXt (Quoted SingleQuote lst) = do + contents <- inlineListToConTeXt lst + return $ "\\quote" <> braces contents +inlineToConTeXt (Quoted DoubleQuote lst) = do + contents <- inlineListToConTeXt lst + return $ "\\quotation" <> braces contents +inlineToConTeXt (Cite _ lst) = inlineListToConTeXt lst +inlineToConTeXt (Str str) = do + opts <- gets stOptions + return $ text $ stringToConTeXt opts str +inlineToConTeXt (Math InlineMath str) = + return $ char '$' <> text str <> char '$' +inlineToConTeXt (Math DisplayMath str) = + return $ text "\\startformula " <> text str <> text " \\stopformula" <> space +inlineToConTeXt (RawInline "context" str) = return $ text str +inlineToConTeXt (RawInline "tex" str) = return $ text str +inlineToConTeXt (RawInline _ _) = return empty +inlineToConTeXt (LineBreak) = return $ text "\\crlf" <> cr +inlineToConTeXt SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + return $ case wrapText of + WrapAuto -> space + WrapNone -> space + WrapPreserve -> cr +inlineToConTeXt Space = return space +-- Handle HTML-like internal document references to sections +inlineToConTeXt (Link _ txt (('#' : ref), _)) = do + opts <- gets stOptions + contents <- inlineListToConTeXt txt + let ref' = toLabel $ stringToConTeXt opts ref + return $ text "\\goto" + <> braces contents + <> brackets (text ref') + +inlineToConTeXt (Link _ txt (src, _)) = do + let isAutolink = txt == [Str (unEscapeString src)] + st <- get + let next = stNextRef st + put $ st {stNextRef = next + 1} + let ref = "url" ++ show next + contents <- inlineListToConTeXt txt + return $ "\\useURL" + <> brackets (text ref) + <> brackets (text $ escapeStringUsing [('#',"\\#"),('%',"\\%")] src) + <> (if isAutolink + then empty + else brackets empty <> brackets contents) + <> "\\from" + <> brackets (text ref) +inlineToConTeXt (Image attr@(_,cls,_) _ (src, _)) = do + opts <- gets stOptions + let showDim dir = let d = text (show dir) <> "=" + in case (dimension dir attr) of + Just (Pixel a) -> + [d <> text (showInInch opts (Pixel a)) <> "in"] + Just (Percent a) -> + [d <> text (showFl (a / 100)) <> "\\textwidth"] + Just dim -> + [d <> text (show dim)] + Nothing -> + [] + dimList = showDim Width ++ showDim Height + dims = if null dimList + then empty + else brackets $ cat (intersperse "," dimList) + clas = if null cls + then empty + else brackets $ text $ toLabel $ head cls + src' = if isURI src + then src + else unEscapeString src + return $ braces $ "\\externalfigure" <> brackets (text src') <> dims <> clas +inlineToConTeXt (Note contents) = do + contents' <- blockListToConTeXt contents + let codeBlock x@(CodeBlock _ _) = [x] + codeBlock _ = [] + let codeBlocks = query codeBlock contents + return $ if null codeBlocks + then text "\\footnote{" <> nest 2 contents' <> char '}' + else text "\\startbuffer " <> nest 2 contents' <> + text "\\stopbuffer\\footnote{\\getbuffer}" +inlineToConTeXt (Span (_,_,kvs) ils) = do + let wrapDir txt = case lookup "dir" kvs of + Just "rtl" -> braces $ "\\righttoleft " <> txt + Just "ltr" -> braces $ "\\lefttoright " <> txt + _ -> txt + wrapLang txt = case lookup "lang" kvs of + Just lng -> "\\start\\language[" <> text (fromBcp47' lng) + <> "]" <> txt <> "\\stop " + Nothing -> txt + fmap (wrapLang . wrapDir) $ inlineListToConTeXt ils + +-- | Craft the section header, inserting the section reference, if supplied. +sectionHeader :: Attr + -> Int + -> [Inline] + -> State WriterState Doc +sectionHeader (ident,classes,_) hdrLevel lst = do + contents <- inlineListToConTeXt lst + st <- get + let opts = stOptions st + let level' = case writerTopLevelDivision opts of + TopLevelPart -> hdrLevel - 2 + TopLevelChapter -> hdrLevel - 1 + TopLevelSection -> hdrLevel + TopLevelDefault -> hdrLevel + let ident' = toLabel ident + let (section, chapter) = if "unnumbered" `elem` classes + then (text "subject", text "title") + else (text "section", text "chapter") + return $ case level' of + -1 -> text "\\part" <> braces contents + 0 -> char '\\' <> chapter <> braces contents + n | n >= 1 && n <= 5 -> char '\\' + <> text (concat (replicate (n - 1) "sub")) + <> section + <> (if (not . null) ident' + then brackets (text ident') + else empty) + <> braces contents + <> blankline + _ -> contents <> blankline + +fromBcp47' :: String -> String +fromBcp47' = fromBcp47 . splitBy (=='-') + +-- Takes a list of the constituents of a BCP 47 language code +-- and irons out ConTeXt's exceptions +-- https://tools.ietf.org/html/bcp47#section-2.1 +-- http://wiki.contextgarden.net/Language_Codes +fromBcp47 :: [String] -> String +fromBcp47 [] = "" +fromBcp47 ("ar":"SY":_) = "ar-sy" +fromBcp47 ("ar":"IQ":_) = "ar-iq" +fromBcp47 ("ar":"JO":_) = "ar-jo" +fromBcp47 ("ar":"LB":_) = "ar-lb" +fromBcp47 ("ar":"DZ":_) = "ar-dz" +fromBcp47 ("ar":"MA":_) = "ar-ma" +fromBcp47 ("de":"1901":_) = "deo" +fromBcp47 ("de":"DE":_) = "de-de" +fromBcp47 ("de":"AT":_) = "de-at" +fromBcp47 ("de":"CH":_) = "de-ch" +fromBcp47 ("el":"poly":_) = "agr" +fromBcp47 ("en":"US":_) = "en-us" +fromBcp47 ("en":"GB":_) = "en-gb" +fromBcp47 ("grc":_) = "agr" +fromBcp47 x = fromIso $ head x + where + fromIso "el" = "gr" + fromIso "eu" = "ba" + fromIso "he" = "il" + fromIso "jp" = "ja" + fromIso "uk" = "ua" + fromIso "vi" = "vn" + fromIso "zh" = "cn" + fromIso l = l diff --git a/src/Text/Pandoc/Writers/Custom.hs b/src/Text/Pandoc/Writers/Custom.hs new file mode 100644 index 000000000..cf641dcd6 --- /dev/null +++ b/src/Text/Pandoc/Writers/Custom.hs @@ -0,0 +1,322 @@ +{-# OPTIONS_GHC -fno-warn-orphans #-} +{-# LANGUAGE FlexibleInstances, OverloadedStrings, + ScopedTypeVariables, DeriveDataTypeable, CPP #-} +#if MIN_VERSION_base(4,8,0) +#else +{-# LANGUAGE OverlappingInstances #-} +#endif +{- Copyright (C) 2012-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Custom + Copyright : Copyright (C) 2012-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to custom markup using +a lua writer. +-} +module Text.Pandoc.Writers.Custom ( writeCustom ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Data.List ( intersperse ) +import Data.Char ( toLower ) +import Data.Typeable +import Scripting.Lua (LuaState, StackValue, callfunc) +import Text.Pandoc.Writers.Shared +import qualified Scripting.Lua as Lua +import qualified Text.Pandoc.UTF8 as UTF8 +import Control.Monad (when) +import Control.Exception +import qualified Data.Map as M +import Text.Pandoc.Templates +import GHC.IO.Encoding (getForeignEncoding,setForeignEncoding, utf8) + +attrToMap :: Attr -> M.Map String String +attrToMap (id',classes,keyvals) = M.fromList + $ ("id", id') + : ("class", unwords classes) + : keyvals + +#if MIN_VERSION_hslua(0,4,0) +#if MIN_VERSION_base(4,8,0) +instance {-# OVERLAPS #-} StackValue [Char] where +#else +instance StackValue [Char] where +#endif + push lua cs = Lua.push lua (UTF8.fromString cs) + peek lua i = do + res <- Lua.peek lua i + return $ UTF8.toString `fmap` res + valuetype _ = Lua.TSTRING +#else +#if MIN_VERSION_base(4,8,0) +instance {-# OVERLAPS #-} StackValue a => StackValue [a] where +#else +instance StackValue a => StackValue [a] where +#endif + push lua xs = do + Lua.createtable lua (length xs + 1) 0 + let addValue (i, x) = Lua.push lua x >> Lua.rawseti lua (-2) i + mapM_ addValue $ zip [1..] xs + peek lua i = do + top <- Lua.gettop lua + let i' = if i < 0 then top + i + 1 else i + Lua.pushnil lua + lst <- getList lua i' + Lua.pop lua 1 + return (Just lst) + valuetype _ = Lua.TTABLE + +getList :: StackValue a => LuaState -> Int -> IO [a] +getList lua i' = do + continue <- Lua.next lua i' + if continue + then do + next <- Lua.peek lua (-1) + Lua.pop lua 1 + x <- maybe (fail "peek returned Nothing") return next + rest <- getList lua i' + return (x : rest) + else return [] +#endif + +instance StackValue Format where + push lua (Format f) = Lua.push lua (map toLower f) + peek l n = fmap Format `fmap` Lua.peek l n + valuetype _ = Lua.TSTRING + +instance (StackValue a, StackValue b) => StackValue (M.Map a b) where + push lua m = do + let xs = M.toList m + Lua.createtable lua (length xs + 1) 0 + let addValue (k, v) = Lua.push lua k >> Lua.push lua v >> + Lua.rawset lua (-3) + mapM_ addValue xs + peek _ _ = undefined -- not needed for our purposes + valuetype _ = Lua.TTABLE + +instance (StackValue a, StackValue b) => StackValue (a,b) where + push lua (k,v) = do + Lua.createtable lua 2 0 + Lua.push lua k + Lua.push lua v + Lua.rawset lua (-3) + peek _ _ = undefined -- not needed for our purposes + valuetype _ = Lua.TTABLE + +#if MIN_VERSION_base(4,8,0) +instance {-# OVERLAPS #-} StackValue [Inline] where +#else +instance StackValue [Inline] where +#endif + push l ils = Lua.push l =<< inlineListToCustom l ils + peek _ _ = undefined + valuetype _ = Lua.TSTRING + +#if MIN_VERSION_base(4,8,0) +instance {-# OVERLAPS #-} StackValue [Block] where +#else +instance StackValue [Block] where +#endif + push l ils = Lua.push l =<< blockListToCustom l ils + peek _ _ = undefined + valuetype _ = Lua.TSTRING + +instance StackValue MetaValue where + push l (MetaMap m) = Lua.push l m + push l (MetaList xs) = Lua.push l xs + push l (MetaBool x) = Lua.push l x + push l (MetaString s) = Lua.push l s + push l (MetaInlines ils) = Lua.push l ils + push l (MetaBlocks bs) = Lua.push l bs + peek _ _ = undefined + valuetype (MetaMap _) = Lua.TTABLE + valuetype (MetaList _) = Lua.TTABLE + valuetype (MetaBool _) = Lua.TBOOLEAN + valuetype (MetaString _) = Lua.TSTRING + valuetype (MetaInlines _) = Lua.TSTRING + valuetype (MetaBlocks _) = Lua.TSTRING + +instance StackValue Citation where + push lua cit = do + Lua.createtable lua 6 0 + let addValue (k :: String, v) = Lua.push lua k >> Lua.push lua v >> + Lua.rawset lua (-3) + addValue ("citationId", citationId cit) + addValue ("citationPrefix", citationPrefix cit) + addValue ("citationSuffix", citationSuffix cit) + addValue ("citationMode", show (citationMode cit)) + addValue ("citationNoteNum", citationNoteNum cit) + addValue ("citationHash", citationHash cit) + peek = undefined + valuetype _ = Lua.TTABLE + +data PandocLuaException = PandocLuaException String + deriving (Show, Typeable) + +instance Exception PandocLuaException + +-- | Convert Pandoc to custom markup. +writeCustom :: FilePath -> WriterOptions -> Pandoc -> IO String +writeCustom luaFile opts doc@(Pandoc meta _) = do + luaScript <- UTF8.readFile luaFile + enc <- getForeignEncoding + setForeignEncoding utf8 + lua <- Lua.newstate + Lua.openlibs lua + status <- Lua.loadstring lua luaScript luaFile + -- check for error in lua script (later we'll change the return type + -- to handle this more gracefully): + when (status /= 0) $ +#if MIN_VERSION_hslua(0,4,0) + Lua.tostring lua 1 >>= throw . PandocLuaException . UTF8.toString +#else + Lua.tostring lua 1 >>= throw . PandocLuaException +#endif + Lua.call lua 0 0 + -- TODO - call hierarchicalize, so we have that info + rendered <- docToCustom lua opts doc + context <- metaToJSON opts + (blockListToCustom lua) + (inlineListToCustom lua) + meta + Lua.close lua + setForeignEncoding enc + let body = rendered + case writerTemplate opts of + Nothing -> return body + Just tpl -> return $ renderTemplate' tpl $ setField "body" body context + +docToCustom :: LuaState -> WriterOptions -> Pandoc -> IO String +docToCustom lua opts (Pandoc (Meta metamap) blocks) = do + body <- blockListToCustom lua blocks + callfunc lua "Doc" body metamap (writerVariables opts) + +-- | Convert Pandoc block element to Custom. +blockToCustom :: LuaState -- ^ Lua state + -> Block -- ^ Block element + -> IO String + +blockToCustom _ Null = return "" + +blockToCustom lua (Plain inlines) = callfunc lua "Plain" inlines + +blockToCustom lua (Para [Image attr txt (src,tit)]) = + callfunc lua "CaptionedImage" src tit txt (attrToMap attr) + +blockToCustom lua (Para inlines) = callfunc lua "Para" inlines + +blockToCustom lua (LineBlock linesList) = callfunc lua "LineBlock" linesList + +blockToCustom lua (RawBlock format str) = + callfunc lua "RawBlock" format str + +blockToCustom lua HorizontalRule = callfunc lua "HorizontalRule" + +blockToCustom lua (Header level attr inlines) = + callfunc lua "Header" level inlines (attrToMap attr) + +blockToCustom lua (CodeBlock attr str) = + callfunc lua "CodeBlock" str (attrToMap attr) + +blockToCustom lua (BlockQuote blocks) = callfunc lua "BlockQuote" blocks + +blockToCustom lua (Table capt aligns widths headers rows') = + callfunc lua "Table" capt (map show aligns) widths headers rows' + +blockToCustom lua (BulletList items) = callfunc lua "BulletList" items + +blockToCustom lua (OrderedList (num,sty,delim) items) = + callfunc lua "OrderedList" items num (show sty) (show delim) + +blockToCustom lua (DefinitionList items) = + callfunc lua "DefinitionList" items + +blockToCustom lua (Div attr items) = + callfunc lua "Div" items (attrToMap attr) + +-- | Convert list of Pandoc block elements to Custom. +blockListToCustom :: LuaState -- ^ Options + -> [Block] -- ^ List of block elements + -> IO String +blockListToCustom lua xs = do + blocksep <- callfunc lua "Blocksep" + bs <- mapM (blockToCustom lua) xs + return $ mconcat $ intersperse blocksep bs + +-- | Convert list of Pandoc inline elements to Custom. +inlineListToCustom :: LuaState -> [Inline] -> IO String +inlineListToCustom lua lst = do + xs <- mapM (inlineToCustom lua) lst + return $ concat xs + +-- | Convert Pandoc inline element to Custom. +inlineToCustom :: LuaState -> Inline -> IO String + +inlineToCustom lua (Str str) = callfunc lua "Str" str + +inlineToCustom lua Space = callfunc lua "Space" + +inlineToCustom lua SoftBreak = callfunc lua "SoftBreak" + +inlineToCustom lua (Emph lst) = callfunc lua "Emph" lst + +inlineToCustom lua (Strong lst) = callfunc lua "Strong" lst + +inlineToCustom lua (Strikeout lst) = callfunc lua "Strikeout" lst + +inlineToCustom lua (Superscript lst) = callfunc lua "Superscript" lst + +inlineToCustom lua (Subscript lst) = callfunc lua "Subscript" lst + +inlineToCustom lua (SmallCaps lst) = callfunc lua "SmallCaps" lst + +inlineToCustom lua (Quoted SingleQuote lst) = callfunc lua "SingleQuoted" lst + +inlineToCustom lua (Quoted DoubleQuote lst) = callfunc lua "DoubleQuoted" lst + +inlineToCustom lua (Cite cs lst) = callfunc lua "Cite" lst cs + +inlineToCustom lua (Code attr str) = + callfunc lua "Code" str (attrToMap attr) + +inlineToCustom lua (Math DisplayMath str) = + callfunc lua "DisplayMath" str + +inlineToCustom lua (Math InlineMath str) = + callfunc lua "InlineMath" str + +inlineToCustom lua (RawInline format str) = + callfunc lua "RawInline" format str + +inlineToCustom lua (LineBreak) = callfunc lua "LineBreak" + +inlineToCustom lua (Link attr txt (src,tit)) = + callfunc lua "Link" txt src tit (attrToMap attr) + +inlineToCustom lua (Image attr alt (src,tit)) = + callfunc lua "Image" alt src tit (attrToMap attr) + +inlineToCustom lua (Note contents) = callfunc lua "Note" contents + +inlineToCustom lua (Span attr items) = + callfunc lua "Span" items (attrToMap attr) diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs new file mode 100644 index 000000000..597851f65 --- /dev/null +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -0,0 +1,440 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Docbook + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Docbook XML. +-} +module Text.Pandoc.Writers.Docbook ( writeDocbook4, writeDocbook5 ) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Shared +import Text.Pandoc.Walk +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Writers.Math +import Data.List ( stripPrefix, isPrefixOf, intercalate, isSuffixOf ) +import Data.Char ( toLower ) +import Data.Monoid ( Any(..) ) +import Text.Pandoc.Highlighting ( languages, languagesByExtension ) +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import qualified Text.Pandoc.Builder as B +import Text.TeXMath +import qualified Text.XML.Light as Xml +import Data.Generics (everywhere, mkT) +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging +import Control.Monad.Reader + +data DocBookVersion = DocBook4 | DocBook5 + deriving (Eq, Show) + +type DB = ReaderT DocBookVersion + +-- | Convert list of authors to a docbook <author> section +authorToDocbook :: PandocMonad m => WriterOptions -> [Inline] -> DB m B.Inlines +authorToDocbook opts name' = do + name <- render Nothing <$> inlinesToDocbook opts name' + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + return $ B.rawInline "docbook" $ render colwidth $ + if ',' `elem` name + then -- last name first + let (lastname, rest) = break (==',') name + firstname = triml rest in + inTagsSimple "firstname" (text $ escapeStringForXML firstname) <> + inTagsSimple "surname" (text $ escapeStringForXML lastname) + else -- last name last + let namewords = words name + lengthname = length namewords + (firstname, lastname) = case lengthname of + 0 -> ("","") + 1 -> ("", name) + n -> (intercalate " " (take (n-1) namewords), last namewords) + in inTagsSimple "firstname" (text $ escapeStringForXML firstname) $$ + inTagsSimple "surname" (text $ escapeStringForXML lastname) + +writeDocbook4 :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeDocbook4 opts d = + runReaderT (writeDocbook opts d) DocBook4 + +writeDocbook5 :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeDocbook5 opts d = + runReaderT (writeDocbook opts d) DocBook5 + +-- | Convert Pandoc document to string in Docbook format. +writeDocbook :: PandocMonad m => WriterOptions -> Pandoc -> DB m String +writeDocbook opts (Pandoc meta blocks) = do + let elements = hierarchicalize blocks + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + let render' = render colwidth + let opts' = if (maybe False (("/book>" `isSuffixOf`) . trimr) + (writerTemplate opts) && + TopLevelDefault == writerTopLevelDivision opts) + then opts{ writerTopLevelDivision = TopLevelChapter } + else opts + -- The numbering here follows LaTeX's internal numbering + let startLvl = case writerTopLevelDivision opts' of + TopLevelPart -> -1 + TopLevelChapter -> 0 + TopLevelSection -> 1 + TopLevelDefault -> 1 + auths' <- mapM (authorToDocbook opts) $ docAuthors meta + let meta' = B.setMeta "author" auths' meta + metadata <- metaToJSON opts + (fmap (render colwidth . vcat) . + (mapM (elementToDocbook opts' startLvl) . + hierarchicalize)) + (fmap (render colwidth) . inlinesToDocbook opts') + meta' + main <- (render' . vcat) <$> (mapM (elementToDocbook opts' startLvl) elements) + let context = defField "body" main + $ defField "mathml" (case writerHTMLMathMethod opts of + MathML -> True + _ -> False) + $ metadata + return $ case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + +-- | Convert an Element to Docbook. +elementToDocbook :: PandocMonad m => WriterOptions -> Int -> Element -> DB m Doc +elementToDocbook opts _ (Blk block) = blockToDocbook opts block +elementToDocbook opts lvl (Sec _ _num (id',_,_) title elements) = do + version <- ask + -- Docbook doesn't allow sections with no content, so insert some if needed + let elements' = if null elements + then [Blk (Para [])] + else elements + tag = case lvl of + -1 -> "part" + 0 -> "chapter" + n | n >= 1 && n <= 5 -> if version == DocBook5 + then "section" + else "sect" ++ show n + _ -> "simplesect" + idName = if version == DocBook5 + then "xml:id" + else "id" + idAttr = [(idName, writerIdentifierPrefix opts ++ id') | not (null id')] + nsAttr = if version == DocBook5 && lvl == 0 then [("xmlns", "http://docbook.org/ns/docbook"),("xmlns:xlink", "http://www.w3.org/1999/xlink")] + else [] + attribs = nsAttr ++ idAttr + contents <- mapM (elementToDocbook opts (lvl + 1)) elements' + title' <- inlinesToDocbook opts title + return $ inTags True tag attribs $ + inTagsSimple "title" title' $$ vcat contents + +-- | Convert a list of Pandoc blocks to Docbook. +blocksToDocbook :: PandocMonad m => WriterOptions -> [Block] -> DB m Doc +blocksToDocbook opts = fmap vcat . mapM (blockToDocbook opts) + +-- | Auxiliary function to convert Plain block to Para. +plainToPara :: Block -> Block +plainToPara (Plain x) = Para x +plainToPara x = x + +-- | Convert a list of pairs of terms and definitions into a list of +-- Docbook varlistentrys. +deflistItemsToDocbook :: PandocMonad m + => WriterOptions -> [([Inline],[[Block]])] -> DB m Doc +deflistItemsToDocbook opts items = + vcat <$> mapM (\(term, defs) -> deflistItemToDocbook opts term defs) items + +-- | Convert a term and a list of blocks into a Docbook varlistentry. +deflistItemToDocbook :: PandocMonad m + => WriterOptions -> [Inline] -> [[Block]] -> DB m Doc +deflistItemToDocbook opts term defs = do + term' <- inlinesToDocbook opts term + def' <- blocksToDocbook opts $ concatMap (map plainToPara) defs + return $ inTagsIndented "varlistentry" $ + inTagsIndented "term" term' $$ + inTagsIndented "listitem" def' + +-- | Convert a list of lists of blocks to a list of Docbook list items. +listItemsToDocbook :: PandocMonad m => WriterOptions -> [[Block]] -> DB m Doc +listItemsToDocbook opts items = vcat <$> mapM (listItemToDocbook opts) items + +-- | Convert a list of blocks into a Docbook list item. +listItemToDocbook :: PandocMonad m => WriterOptions -> [Block] -> DB m Doc +listItemToDocbook opts item = + inTagsIndented "listitem" <$> blocksToDocbook opts (map plainToPara item) + +imageToDocbook :: WriterOptions -> Attr -> String -> Doc +imageToDocbook _ attr src = selfClosingTag "imagedata" $ + ("fileref", src) : idAndRole attr ++ dims + where + dims = go Width "width" ++ go Height "depth" + go dir dstr = case (dimension dir attr) of + Just a -> [(dstr, show a)] + Nothing -> [] + +-- | Convert a Pandoc block element to Docbook. +blockToDocbook :: PandocMonad m => WriterOptions -> Block -> DB m Doc +blockToDocbook _ Null = return empty +-- Add ids to paragraphs in divs with ids - this is needed for +-- pandoc-citeproc to get link anchors in bibliographies: +blockToDocbook opts (Div (ident,_,_) [Para lst]) = + let attribs = [("id", ident) | not (null ident)] in + if hasLineBreaks lst + then (flush . nowrap . inTags False "literallayout" attribs) + <$> inlinesToDocbook opts lst + else inTags True "para" attribs <$> inlinesToDocbook opts lst +blockToDocbook opts (Div (ident,_,_) bs) = do + contents <- blocksToDocbook opts (map plainToPara bs) + return $ + (if null ident + then mempty + else selfClosingTag "anchor" [("id", ident)]) $$ contents +blockToDocbook _ (Header _ _ _) = + return empty -- should not occur after hierarchicalize +blockToDocbook opts (Plain lst) = inlinesToDocbook opts lst +-- title beginning with fig: indicates that the image is a figure +blockToDocbook opts (Para [Image attr txt (src,'f':'i':'g':':':_)]) = do + alt <- inlinesToDocbook opts txt + let capt = if null txt + then empty + else inTagsSimple "title" alt + return $ inTagsIndented "figure" $ + capt $$ + (inTagsIndented "mediaobject" $ + (inTagsIndented "imageobject" + (imageToDocbook opts attr src)) $$ + inTagsSimple "textobject" (inTagsSimple "phrase" alt)) +blockToDocbook opts (Para lst) + | hasLineBreaks lst = (flush . nowrap . inTagsSimple "literallayout") + <$> inlinesToDocbook opts lst + | otherwise = inTagsIndented "para" <$> inlinesToDocbook opts lst +blockToDocbook opts (LineBlock lns) = + blockToDocbook opts $ linesToPara lns +blockToDocbook opts (BlockQuote blocks) = + inTagsIndented "blockquote" <$> blocksToDocbook opts blocks +blockToDocbook _ (CodeBlock (_,classes,_) str) = return $ + text ("<programlisting" ++ lang ++ ">") <> cr <> + flush (text (escapeStringForXML str) <> cr <> text "</programlisting>") + where lang = if null langs + then "" + else " language=\"" ++ escapeStringForXML (head langs) ++ + "\"" + isLang l = map toLower l `elem` map (map toLower) languages + langsFrom s = if isLang s + then [s] + else languagesByExtension . map toLower $ s + langs = concatMap langsFrom classes +blockToDocbook opts (BulletList lst) = do + let attribs = [("spacing", "compact") | isTightList lst] + inTags True "itemizedlist" attribs <$> listItemsToDocbook opts lst +blockToDocbook _ (OrderedList _ []) = return empty +blockToDocbook opts (OrderedList (start, numstyle, _) (first:rest)) = do + let numeration = case numstyle of + DefaultStyle -> [] + Decimal -> [("numeration", "arabic")] + Example -> [("numeration", "arabic")] + UpperAlpha -> [("numeration", "upperalpha")] + LowerAlpha -> [("numeration", "loweralpha")] + UpperRoman -> [("numeration", "upperroman")] + LowerRoman -> [("numeration", "lowerroman")] + spacing = [("spacing", "compact") | isTightList (first:rest)] + attribs = numeration ++ spacing + items <- if start == 1 + then listItemsToDocbook opts (first:rest) + else do + first' <- blocksToDocbook opts (map plainToPara first) + rest' <- listItemsToDocbook opts rest + return $ + (inTags True "listitem" [("override",show start)] first') $$ + rest' + return $ inTags True "orderedlist" attribs items +blockToDocbook opts (DefinitionList lst) = do + let attribs = [("spacing", "compact") | isTightList $ concatMap snd lst] + inTags True "variablelist" attribs <$> deflistItemsToDocbook opts lst +blockToDocbook _ b@(RawBlock f str) + | f == "docbook" = return $ text str -- raw XML block + | f == "html" = do + version <- ask + if version == DocBook5 + then return empty -- No html in Docbook5 + else return $ text str -- allow html for backwards compatibility + | otherwise = do + report $ BlockNotRendered b + return empty +blockToDocbook _ HorizontalRule = return empty -- not semantic +blockToDocbook opts (Table caption aligns widths headers rows) = do + captionDoc <- if null caption + then return empty + else inTagsIndented "title" <$> + inlinesToDocbook opts caption + let tableType = if isEmpty captionDoc then "informaltable" else "table" + percent w = show (truncate (100*w) :: Integer) ++ "*" + coltags = vcat $ zipWith (\w al -> selfClosingTag "colspec" + ([("colwidth", percent w) | w > 0] ++ + [("align", alignmentToString al)])) widths aligns + head' <- if all null headers + then return empty + else inTagsIndented "thead" <$> tableRowToDocbook opts headers + body' <- (inTagsIndented "tbody" . vcat) <$> + mapM (tableRowToDocbook opts) rows + return $ inTagsIndented tableType $ captionDoc $$ + (inTags True "tgroup" [("cols", show (length headers))] $ + coltags $$ head' $$ body') + +hasLineBreaks :: [Inline] -> Bool +hasLineBreaks = getAny . query isLineBreak . walk removeNote + where + removeNote :: Inline -> Inline + removeNote (Note _) = Str "" + removeNote x = x + isLineBreak :: Inline -> Any + isLineBreak LineBreak = Any True + isLineBreak _ = Any False + +alignmentToString :: Alignment -> [Char] +alignmentToString alignment = case alignment of + AlignLeft -> "left" + AlignRight -> "right" + AlignCenter -> "center" + AlignDefault -> "left" + +tableRowToDocbook :: PandocMonad m + => WriterOptions + -> [[Block]] + -> DB m Doc +tableRowToDocbook opts cols = + (inTagsIndented "row" . vcat) <$> mapM (tableItemToDocbook opts) cols + +tableItemToDocbook :: PandocMonad m + => WriterOptions + -> [Block] + -> DB m Doc +tableItemToDocbook opts item = + (inTags True "entry" [] . vcat) <$> mapM (blockToDocbook opts) item + +-- | Convert a list of inline elements to Docbook. +inlinesToDocbook :: PandocMonad m => WriterOptions -> [Inline] -> DB m Doc +inlinesToDocbook opts lst = hcat <$> mapM (inlineToDocbook opts) lst + +-- | Convert an inline element to Docbook. +inlineToDocbook :: PandocMonad m => WriterOptions -> Inline -> DB m Doc +inlineToDocbook _ (Str str) = return $ text $ escapeStringForXML str +inlineToDocbook opts (Emph lst) = + inTagsSimple "emphasis" <$> inlinesToDocbook opts lst +inlineToDocbook opts (Strong lst) = + inTags False "emphasis" [("role", "strong")] <$> inlinesToDocbook opts lst +inlineToDocbook opts (Strikeout lst) = + inTags False "emphasis" [("role", "strikethrough")] <$> + inlinesToDocbook opts lst +inlineToDocbook opts (Superscript lst) = + inTagsSimple "superscript" <$> inlinesToDocbook opts lst +inlineToDocbook opts (Subscript lst) = + inTagsSimple "subscript" <$> inlinesToDocbook opts lst +inlineToDocbook opts (SmallCaps lst) = + inTags False "emphasis" [("role", "smallcaps")] <$> + inlinesToDocbook opts lst +inlineToDocbook opts (Quoted _ lst) = + inTagsSimple "quote" <$> inlinesToDocbook opts lst +inlineToDocbook opts (Cite _ lst) = + inlinesToDocbook opts lst +inlineToDocbook opts (Span (ident,_,_) ils) = + ((if null ident + then mempty + else selfClosingTag "anchor" [("id", ident)]) <>) <$> + inlinesToDocbook opts ils +inlineToDocbook _ (Code _ str) = + return $ inTagsSimple "literal" $ text (escapeStringForXML str) +inlineToDocbook opts (Math t str) + | isMathML (writerHTMLMathMethod opts) = do + res <- convertMath writeMathML t str + case res of + Right r -> return $ inTagsSimple tagtype + $ text $ Xml.ppcElement conf + $ fixNS + $ removeAttr r + Left il -> inlineToDocbook opts il + | otherwise = + texMathToInlines t str >>= inlinesToDocbook opts + where tagtype = case t of + InlineMath -> "inlineequation" + DisplayMath -> "informalequation" + conf = Xml.useShortEmptyTags (const False) Xml.defaultConfigPP + removeAttr e = e{ Xml.elAttribs = [] } + fixNS' qname = qname{ Xml.qPrefix = Just "mml" } + fixNS = everywhere (mkT fixNS') +inlineToDocbook _ il@(RawInline f x) + | f == "html" || f == "docbook" = return $ text x + | otherwise = do + report $ InlineNotRendered il + return empty +inlineToDocbook _ LineBreak = return $ text "\n" +-- currently ignore, would require the option to add custom +-- styles to the document +inlineToDocbook _ Space = return space +-- because we use \n for LineBreak, we can't do soft breaks: +inlineToDocbook _ SoftBreak = return space +inlineToDocbook opts (Link attr txt (src, _)) + | Just email <- stripPrefix "mailto:" src = + let emailLink = inTagsSimple "email" $ text $ + escapeStringForXML $ email + in case txt of + [Str s] | escapeURI s == email -> return emailLink + _ -> do contents <- inlinesToDocbook opts txt + return $ contents <+> + char '(' <> emailLink <> char ')' + | otherwise = do + version <- ask + (if isPrefixOf "#" src + then inTags False "link" $ ("linkend", writerIdentifierPrefix opts ++ drop 1 src) : idAndRole attr + else if version == DocBook5 + then inTags False "link" $ ("xlink:href", src) : idAndRole attr + else inTags False "ulink" $ ("url", src) : idAndRole attr ) + <$> inlinesToDocbook opts txt +inlineToDocbook opts (Image attr _ (src, tit)) = return $ + let titleDoc = if null tit + then empty + else inTagsIndented "objectinfo" $ + inTagsIndented "title" (text $ escapeStringForXML tit) + in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ + titleDoc $$ imageToDocbook opts attr src +inlineToDocbook opts (Note contents) = + inTagsIndented "footnote" <$> blocksToDocbook opts contents + +isMathML :: HTMLMathMethod -> Bool +isMathML MathML = True +isMathML _ = False + +idAndRole :: Attr -> [(String, String)] +idAndRole (id',cls,_) = ident ++ role + where + ident = if null id' + then [] + else [("id", id')] + role = if null cls + then [] + else [("role", unwords cls)] diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs new file mode 100644 index 000000000..56aa29211 --- /dev/null +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -0,0 +1,1302 @@ +{-# LANGUAGE ScopedTypeVariables, PatternGuards, ViewPatterns, DeriveFunctor #-} +{- +Copyright (C) 2012-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Docx + Copyright : Copyright (C) 2012-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to docx. +-} +module Text.Pandoc.Writers.Docx ( writeDocx ) where +import Data.List ( intercalate, isPrefixOf, isSuffixOf ) +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString.Lazy.Char8 as BL8 +import qualified Data.Map as M +import qualified Data.Set as Set +import qualified Text.Pandoc.UTF8 as UTF8 +import Codec.Archive.Zip +import Data.Time.Clock.POSIX +import Text.Pandoc.Compat.Time +import Text.Pandoc.Definition +import Text.Pandoc.Generic +import Text.Pandoc.ImageSize +import Text.Pandoc.Shared hiding (Element) +import Text.Pandoc.Writers.Shared (fixDisplayMath) +import Text.Pandoc.Options +import Text.Pandoc.Writers.Math +import Text.Pandoc.Highlighting ( highlight ) +import Text.Pandoc.Walk +import Text.Pandoc.Error (PandocError) +import Text.XML.Light as XML +import Text.TeXMath +import Text.Pandoc.Readers.Docx.StyleMap +import Control.Monad.Reader +import Control.Monad.State +import Skylighting +import Control.Monad.Except (runExceptT) +import System.Random (randomR) +import Text.Printf (printf) +import Data.Monoid ((<>)) +import qualified Data.Text as T +import Text.Pandoc.MIME (MimeType, getMimeType, getMimeTypeDef, + extensionFromMimeType) +import Control.Applicative ((<|>)) +import Data.Maybe (fromMaybe, mapMaybe, maybeToList, isNothing) +import Data.Char (ord, isSpace, toLower) +import Text.Pandoc.Class (PandocMonad, report) +import qualified Text.Pandoc.Class as P +import Text.Pandoc.Logging + +data ListMarker = NoMarker + | BulletMarker + | NumberMarker ListNumberStyle ListNumberDelim Int + deriving (Show, Read, Eq, Ord) + +listMarkerToId :: ListMarker -> String +listMarkerToId NoMarker = "990" +listMarkerToId BulletMarker = "991" +listMarkerToId (NumberMarker sty delim n) = + '9' : '9' : styNum : delimNum : show n + where styNum = case sty of + DefaultStyle -> '2' + Example -> '3' + Decimal -> '4' + LowerRoman -> '5' + UpperRoman -> '6' + LowerAlpha -> '7' + UpperAlpha -> '8' + delimNum = case delim of + DefaultDelim -> '0' + Period -> '1' + OneParen -> '2' + TwoParens -> '3' + +data WriterEnv = WriterEnv{ envTextProperties :: [Element] + , envParaProperties :: [Element] + , envRTL :: Bool + , envListLevel :: Int + , envListNumId :: Int + , envInDel :: Bool + , envChangesAuthor :: String + , envChangesDate :: String + , envPrintWidth :: Integer + } + +defaultWriterEnv :: WriterEnv +defaultWriterEnv = WriterEnv{ envTextProperties = [] + , envParaProperties = [] + , envRTL = False + , envListLevel = -1 + , envListNumId = 1 + , envInDel = False + , envChangesAuthor = "unknown" + , envChangesDate = "1969-12-31T19:00:00Z" + , envPrintWidth = 1 + } + +data WriterState = WriterState{ + stFootnotes :: [Element] + , stSectionIds :: Set.Set String + , stExternalLinks :: M.Map String String + , stImages :: M.Map FilePath (String, String, Maybe MimeType, Element, B.ByteString) + , stLists :: [ListMarker] + , stInsId :: Int + , stDelId :: Int + , stStyleMaps :: StyleMaps + , stFirstPara :: Bool + , stTocTitle :: [Inline] + , stDynamicParaProps :: [String] + , stDynamicTextProps :: [String] + } + +defaultWriterState :: WriterState +defaultWriterState = WriterState{ + stFootnotes = defaultFootnotes + , stSectionIds = Set.empty + , stExternalLinks = M.empty + , stImages = M.empty + , stLists = [NoMarker] + , stInsId = 1 + , stDelId = 1 + , stStyleMaps = defaultStyleMaps + , stFirstPara = False + , stTocTitle = [Str "Table of Contents"] + , stDynamicParaProps = [] + , stDynamicTextProps = [] + } + +type WS m = ReaderT WriterEnv (StateT WriterState m) + +mknode :: Node t => String -> [(String,String)] -> t -> Element +mknode s attrs = + add_attrs (map (\(k,v) -> Attr (nodename k) v) attrs) . node (nodename s) + +nodename :: String -> QName +nodename s = QName{ qName = name, qURI = Nothing, qPrefix = prefix } + where (name, prefix) = case break (==':') s of + (xs,[]) -> (xs, Nothing) + (ys, _:zs) -> (zs, Just ys) + +toLazy :: B.ByteString -> BL.ByteString +toLazy = BL.fromChunks . (:[]) + +renderXml :: Element -> BL.ByteString +renderXml elt = BL8.pack "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" <> + UTF8.fromStringLazy (showElement elt) + +renumIdMap :: Int -> [Element] -> M.Map String String +renumIdMap _ [] = M.empty +renumIdMap n (e:es) + | Just oldId <- findAttr (QName "Id" Nothing Nothing) e = + M.insert oldId ("rId" ++ (show n)) (renumIdMap (n+1) es) + | otherwise = renumIdMap n es + +replaceAttr :: (QName -> Bool) -> String -> [XML.Attr] -> [XML.Attr] +replaceAttr _ _ [] = [] +replaceAttr f val (a:as) | f (attrKey a) = + (XML.Attr (attrKey a) val) : (replaceAttr f val as) + | otherwise = a : (replaceAttr f val as) + +renumId :: (QName -> Bool) -> (M.Map String String) -> Element -> Element +renumId f renumMap e + | Just oldId <- findAttrBy f e + , Just newId <- M.lookup oldId renumMap = + let attrs' = replaceAttr f newId (elAttribs e) + in + e { elAttribs = attrs' } + | otherwise = e + +renumIds :: (QName -> Bool) -> (M.Map String String) -> [Element] -> [Element] +renumIds f renumMap = map (renumId f renumMap) + +-- | Certain characters are invalid in XML even if escaped. +-- See #1992 +stripInvalidChars :: String -> String +stripInvalidChars = filter isValidChar + +-- | See XML reference +isValidChar :: Char -> Bool +isValidChar (ord -> c) + | c == 0x9 = True + | c == 0xA = True + | c == 0xD = True + | 0x20 <= c && c <= 0xD7FF = True + | 0xE000 <= c && c <= 0xFFFD = True + | 0x10000 <= c && c <= 0x10FFFF = True + | otherwise = False + +metaValueToInlines :: MetaValue -> [Inline] +metaValueToInlines (MetaString s) = [Str s] +metaValueToInlines (MetaInlines ils) = ils +metaValueToInlines (MetaBlocks bs) = query return bs +metaValueToInlines (MetaBool b) = [Str $ show b] +metaValueToInlines _ = [] + + + +writeDocx :: (PandocMonad m) + => WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> m BL.ByteString +writeDocx opts doc@(Pandoc meta _) = do + let datadir = writerUserDataDir opts + let doc' = walk fixDisplayMath $ doc + username <- P.lookupEnv "USERNAME" + utctime <- P.getCurrentTime + distArchive <- (toArchive . BL.fromStrict) <$> + P.readDataFile datadir "reference.docx" + refArchive <- case writerReferenceDoc opts of + Just f -> toArchive <$> P.readFileLazy f + Nothing -> return distArchive + + parsedDoc <- parseXml refArchive distArchive "word/document.xml" + let wname f qn = qPrefix qn == Just "w" && f (qName qn) + let mbsectpr = filterElementName (wname (=="sectPr")) parsedDoc + + -- Gets the template size + let mbpgsz = mbsectpr >>= (filterElementName (wname (=="pgSz"))) + let mbAttrSzWidth = (elAttribs <$> mbpgsz) >>= (lookupAttrBy ((=="w") . qName)) + + let mbpgmar = mbsectpr >>= (filterElementName (wname (=="pgMar"))) + let mbAttrMarLeft = (elAttribs <$> mbpgmar) >>= (lookupAttrBy ((=="left") . qName)) + let mbAttrMarRight = (elAttribs <$> mbpgmar) >>= (lookupAttrBy ((=="right") . qName)) + + -- Get the avaible area (converting the size and the margins to int and + -- doing the difference + let pgContentWidth = (-) <$> (read <$> mbAttrSzWidth ::Maybe Integer) + <*> ( + (+) <$> (read <$> mbAttrMarRight ::Maybe Integer) + <*> (read <$> mbAttrMarLeft ::Maybe Integer) + ) + + -- styles + let stylepath = "word/styles.xml" + styledoc <- parseXml refArchive distArchive stylepath + + -- parse styledoc for heading styles + let styleMaps = getStyleMaps styledoc + + let tocTitle = fromMaybe (stTocTitle defaultWriterState) $ + metaValueToInlines <$> lookupMeta "toc-title" meta + + let initialSt = defaultWriterState { + stStyleMaps = styleMaps + , stTocTitle = tocTitle + } + + let isRTLmeta = case lookupMeta "dir" meta of + Just (MetaString "rtl") -> True + Just (MetaInlines [Str "rtl"]) -> True + _ -> False + + let env = defaultWriterEnv { + envRTL = isRTLmeta + , envChangesAuthor = fromMaybe "unknown" username + , envChangesDate = formatTime defaultTimeLocale "%FT%XZ" utctime + , envPrintWidth = (maybe 420 (\x -> quot x 20) pgContentWidth) + } + + + ((contents, footnotes), st) <- runStateT + (runReaderT + (writeOpenXML opts{writerWrapText = WrapNone} doc') + env) + initialSt + let epochtime = floor $ utcTimeToPOSIXSeconds utctime + let imgs = M.elems $ stImages st + + -- create entries for images in word/media/... + let toImageEntry (_,path,_,_,img) = toEntry ("word/" ++ path) epochtime $ toLazy img + let imageEntries = map toImageEntry imgs + + let stdAttributes = + [("xmlns:w","http://schemas.openxmlformats.org/wordprocessingml/2006/main") + ,("xmlns:m","http://schemas.openxmlformats.org/officeDocument/2006/math") + ,("xmlns:r","http://schemas.openxmlformats.org/officeDocument/2006/relationships") + ,("xmlns:o","urn:schemas-microsoft-com:office:office") + ,("xmlns:v","urn:schemas-microsoft-com:vml") + ,("xmlns:w10","urn:schemas-microsoft-com:office:word") + ,("xmlns:a","http://schemas.openxmlformats.org/drawingml/2006/main") + ,("xmlns:pic","http://schemas.openxmlformats.org/drawingml/2006/picture") + ,("xmlns:wp","http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing")] + + + parsedRels <- parseXml refArchive distArchive "word/_rels/document.xml.rels" + let isHeaderNode e = findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header" + let isFooterNode e = findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" + let headers = filterElements isHeaderNode parsedRels + let footers = filterElements isFooterNode parsedRels + + let extractTarget = findAttr (QName "Target" Nothing Nothing) + + -- we create [Content_Types].xml and word/_rels/document.xml.rels + -- from scratch rather than reading from reference.docx, + -- because Word sometimes changes these files when a reference.docx is modified, + -- e.g. deleting the reference to footnotes.xml or removing default entries + -- for image content types. + + -- [Content_Types].xml + let mkOverrideNode (part', contentType') = mknode "Override" + [("PartName",part'),("ContentType",contentType')] () + let mkImageOverride (_, imgpath, mbMimeType, _, _) = + mkOverrideNode ("/word/" ++ imgpath, + fromMaybe "application/octet-stream" mbMimeType) + let mkMediaOverride imgpath = + mkOverrideNode ('/':imgpath, getMimeTypeDef imgpath) + let overrides = map mkOverrideNode ( + [("/word/webSettings.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml") + ,("/word/numbering.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml") + ,("/word/settings.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml") + ,("/word/theme/theme1.xml", + "application/vnd.openxmlformats-officedocument.theme+xml") + ,("/word/fontTable.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml") + ,("/docProps/app.xml", + "application/vnd.openxmlformats-officedocument.extended-properties+xml") + ,("/docProps/core.xml", + "application/vnd.openxmlformats-package.core-properties+xml") + ,("/word/styles.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml") + ,("/word/document.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") + ,("/word/footnotes.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml") + ] ++ + map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml")) headers ++ + map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml")) footers) ++ + map mkImageOverride imgs ++ + map mkMediaOverride [ eRelativePath e | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] + + let defaultnodes = [mknode "Default" + [("Extension","xml"),("ContentType","application/xml")] (), + mknode "Default" + [("Extension","rels"),("ContentType","application/vnd.openxmlformats-package.relationships+xml")] ()] + let contentTypesDoc = mknode "Types" [("xmlns","http://schemas.openxmlformats.org/package/2006/content-types")] $ defaultnodes ++ overrides + let contentTypesEntry = toEntry "[Content_Types].xml" epochtime + $ renderXml contentTypesDoc + + -- word/_rels/document.xml.rels + let toBaseRel (url', id', target') = mknode "Relationship" + [("Type",url') + ,("Id",id') + ,("Target",target')] () + let baserels' = map toBaseRel + [("http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering", + "rId1", + "numbering.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles", + "rId2", + "styles.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings", + "rId3", + "settings.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings", + "rId4", + "webSettings.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable", + "rId5", + "fontTable.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme", + "rId6", + "theme/theme1.xml") + ,("http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes", + "rId7", + "footnotes.xml") + ] + + let idMap = renumIdMap (length baserels' + 1) (headers ++ footers) + let renumHeaders = renumIds (\q -> qName q == "Id") idMap headers + let renumFooters = renumIds (\q -> qName q == "Id") idMap footers + let baserels = baserels' ++ renumHeaders ++ renumFooters + let toImgRel (ident,path,_,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",ident),("Target",path)] () + let imgrels = map toImgRel imgs + let toLinkRel (src,ident) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"),("Id",ident),("Target",src),("TargetMode","External") ] () + let linkrels = map toLinkRel $ M.toList $ stExternalLinks st + let reldoc = mknode "Relationships" [("xmlns","http://schemas.openxmlformats.org/package/2006/relationships")] $ baserels ++ imgrels ++ linkrels + let relEntry = toEntry "word/_rels/document.xml.rels" epochtime + $ renderXml reldoc + + + -- adjust contents to add sectPr from reference.docx + let sectpr = case mbsectpr of + Just sectpr' -> let cs = renumIds + (\q -> qName q == "id" && qPrefix q == Just "r") + idMap + (elChildren sectpr') + in + add_attrs (elAttribs sectpr') $ mknode "w:sectPr" [] cs + Nothing -> (mknode "w:sectPr" [] ()) + + -- let sectpr = fromMaybe (mknode "w:sectPr" [] ()) mbsectpr' + let contents' = contents ++ [sectpr] + let docContents = mknode "w:document" stdAttributes + $ mknode "w:body" [] contents' + + + + -- word/document.xml + let contentEntry = toEntry "word/document.xml" epochtime + $ renderXml docContents + + -- footnotes + let notes = mknode "w:footnotes" stdAttributes footnotes + let footnotesEntry = toEntry "word/footnotes.xml" epochtime $ renderXml notes + + -- footnote rels + let footnoteRelEntry = toEntry "word/_rels/footnotes.xml.rels" epochtime + $ renderXml $ mknode "Relationships" [("xmlns","http://schemas.openxmlformats.org/package/2006/relationships")] + linkrels + + -- styles + + -- We only want to inject paragraph and text properties that + -- are not already in the style map. Note that keys in the stylemap + -- are normalized as lowercase. + let newDynamicParaProps = filter + (\sty -> isNothing $ M.lookup (toLower <$> sty) $ getMap $ sParaStyleMap styleMaps) + (stDynamicParaProps st) + + newDynamicTextProps = filter + (\sty -> isNothing $ M.lookup (toLower <$> sty) $ getMap $ sCharStyleMap styleMaps) + (stDynamicTextProps st) + + let newstyles = map newParaPropToOpenXml newDynamicParaProps ++ + map newTextPropToOpenXml newDynamicTextProps ++ + (case writerHighlightStyle opts of + Nothing -> [] + Just sty -> (styleToOpenXml styleMaps sty)) + let styledoc' = styledoc{ elContent = elContent styledoc ++ + map Elem newstyles } + let styleEntry = toEntry stylepath epochtime $ renderXml styledoc' + + -- construct word/numbering.xml + let numpath = "word/numbering.xml" + numbering <- parseXml refArchive distArchive numpath + newNumElts <- mkNumbering (stLists st) + let allElts = onlyElems (elContent numbering) ++ newNumElts + let numEntry = toEntry numpath epochtime $ renderXml numbering{ elContent = + -- we want all the abstractNums first, then the nums, + -- otherwise things break: + [Elem e | e <- allElts + , qName (elName e) == "abstractNum" ] ++ + [Elem e | e <- allElts + , qName (elName e) == "num" ] } + let docPropsPath = "docProps/core.xml" + let docProps = mknode "cp:coreProperties" + [("xmlns:cp","http://schemas.openxmlformats.org/package/2006/metadata/core-properties") + ,("xmlns:dc","http://purl.org/dc/elements/1.1/") + ,("xmlns:dcterms","http://purl.org/dc/terms/") + ,("xmlns:dcmitype","http://purl.org/dc/dcmitype/") + ,("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance")] + $ mknode "dc:title" [] (stringify $ docTitle meta) + : mknode "dc:creator" [] (intercalate "; " (map stringify $ docAuthors meta)) + : (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] x + , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x + ]) (formatTime defaultTimeLocale "%FT%XZ" utctime) + let docPropsEntry = toEntry docPropsPath epochtime $ renderXml docProps + + let relsPath = "_rels/.rels" + let rels = mknode "Relationships" [("xmlns", "http://schemas.openxmlformats.org/package/2006/relationships")] + $ map (\attrs -> mknode "Relationship" attrs ()) + [ [("Id","rId1") + ,("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") + ,("Target","word/document.xml")] + , [("Id","rId4") + ,("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties") + ,("Target","docProps/app.xml")] + , [("Id","rId3") + ,("Type","http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties") + ,("Target","docProps/core.xml")] + ] + let relsEntry = toEntry relsPath epochtime $ renderXml rels + + -- we use dist archive for settings.xml, because Word sometimes + -- adds references to footnotes or endnotes we don't have... + -- we do, however, copy some settings over from reference + let settingsPath = "word/settings.xml" + settingsList = [ "w:autoHyphenation" + , "w:consecutiveHyphenLimit" + , "w:hyphenationZone" + , "w:doNotHyphenateCap" + ] + settingsEntry <- copyChildren refArchive distArchive settingsPath epochtime settingsList + + let entryFromArchive arch path = + maybe (fail $ path ++ " missing in reference docx") + return + (findEntryByPath path arch `mplus` findEntryByPath path distArchive) + docPropsAppEntry <- entryFromArchive refArchive "docProps/app.xml" + themeEntry <- entryFromArchive refArchive "word/theme/theme1.xml" + fontTableEntry <- entryFromArchive refArchive "word/fontTable.xml" + webSettingsEntry <- entryFromArchive refArchive "word/webSettings.xml" + headerFooterEntries <- mapM (entryFromArchive refArchive) $ + mapMaybe (fmap ("word/" ++) . extractTarget) + (headers ++ footers) + let miscRelEntries = [ e | e <- zEntries refArchive + , "word/_rels/" `isPrefixOf` (eRelativePath e) + , ".xml.rels" `isSuffixOf` (eRelativePath e) + , eRelativePath e /= "word/_rels/document.xml.rels" + , eRelativePath e /= "word/_rels/footnotes.xml.rels" ] + let otherMediaEntries = [ e | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] + + -- Create archive + let archive = foldr addEntryToArchive emptyArchive $ + contentTypesEntry : relsEntry : contentEntry : relEntry : + footnoteRelEntry : numEntry : styleEntry : footnotesEntry : + docPropsEntry : docPropsAppEntry : themeEntry : + fontTableEntry : settingsEntry : webSettingsEntry : + imageEntries ++ headerFooterEntries ++ + miscRelEntries ++ otherMediaEntries + return $ fromArchive archive + + +newParaPropToOpenXml :: String -> Element +newParaPropToOpenXml s = + let styleId = filter (not . isSpace) s + in mknode "w:style" [ ("w:type", "paragraph") + , ("w:customStyle", "1") + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () + , mknode "w:basedOn" [("w:val","BodyText")] () + , mknode "w:qFormat" [] () + ] + +newTextPropToOpenXml :: String -> Element +newTextPropToOpenXml s = + let styleId = filter (not . isSpace) s + in mknode "w:style" [ ("w:type", "character") + , ("w:customStyle", "1") + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () + , mknode "w:basedOn" [("w:val","BodyTextChar")] () + ] + +styleToOpenXml :: StyleMaps -> Style -> [Element] +styleToOpenXml sm style = + maybeToList parStyle ++ mapMaybe toStyle alltoktypes + where alltoktypes = enumFromTo KeywordTok NormalTok + toStyle toktype | hasStyleName (show toktype) (sCharStyleMap sm) = Nothing + | otherwise = Just $ + mknode "w:style" [("w:type","character"), + ("w:customStyle","1"),("w:styleId",show toktype)] + [ mknode "w:name" [("w:val",show toktype)] () + , mknode "w:basedOn" [("w:val","VerbatimChar")] () + , mknode "w:rPr" [] $ + [ mknode "w:color" [("w:val",tokCol toktype)] () + | tokCol toktype /= "auto" ] ++ + [ mknode "w:shd" [("w:val","clear"),("w:fill",tokBg toktype)] () + | tokBg toktype /= "auto" ] ++ + [ mknode "w:b" [] () | tokFeature tokenBold toktype ] ++ + [ mknode "w:i" [] () | tokFeature tokenItalic toktype ] ++ + [ mknode "w:u" [] () | tokFeature tokenUnderline toktype ] + ] + tokStyles = tokenStyles style + tokFeature f toktype = maybe False f $ lookup toktype tokStyles + tokCol toktype = maybe "auto" (drop 1 . fromColor) + $ (tokenColor =<< lookup toktype tokStyles) + `mplus` defaultColor style + tokBg toktype = maybe "auto" (drop 1 . fromColor) + $ (tokenBackground =<< lookup toktype tokStyles) + `mplus` backgroundColor style + parStyle | hasStyleName "Source Code" (sParaStyleMap sm) = Nothing + | otherwise = Just $ + mknode "w:style" [("w:type","paragraph"), + ("w:customStyle","1"),("w:styleId","SourceCode")] + [ mknode "w:name" [("w:val","Source Code")] () + , mknode "w:basedOn" [("w:val","Normal")] () + , mknode "w:link" [("w:val","VerbatimChar")] () + , mknode "w:pPr" [] + $ mknode "w:wordWrap" [("w:val","off")] () + : ( maybe [] (\col -> [mknode "w:shd" [("w:val","clear"),("w:fill",drop 1 $ fromColor col)] ()]) + $ backgroundColor style ) + ] + +copyChildren :: (PandocMonad m) => Archive -> Archive -> String -> Integer -> [String] -> m Entry +copyChildren refArchive distArchive path timestamp elNames = do + ref <- parseXml refArchive distArchive path + dist <- parseXml distArchive distArchive path + return $ toEntry path timestamp $ renderXml dist{ + elContent = elContent dist ++ copyContent ref + } + where + strName QName{qName=name, qPrefix=prefix} + | Just p <- prefix = p++":"++name + | otherwise = name + shouldCopy = (`elem` elNames) . strName + cleanElem el@Element{elName=name} = Elem el{elName=name{qURI=Nothing}} + copyContent = map cleanElem . filterChildrenName shouldCopy + +-- this is the lowest number used for a list numId +baseListId :: Int +baseListId = 1000 + +mkNumbering :: (PandocMonad m) => [ListMarker] -> m [Element] +mkNumbering lists = do + elts <- mapM mkAbstractNum (ordNub lists) + return $ elts ++ zipWith mkNum lists [baseListId..(baseListId + length lists - 1)] + +mkNum :: ListMarker -> Int -> Element +mkNum marker numid = + mknode "w:num" [("w:numId",show numid)] + $ mknode "w:abstractNumId" [("w:val",listMarkerToId marker)] () + : case marker of + NoMarker -> [] + BulletMarker -> [] + NumberMarker _ _ start -> + map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",show (lvl :: Int))] + $ mknode "w:startOverride" [("w:val",show start)] ()) [0..6] + +mkAbstractNum :: (PandocMonad m) => ListMarker -> m Element +mkAbstractNum marker = do + gen <- P.newStdGen + let (nsid, _) = randomR (0x10000000 :: Integer, 0xFFFFFFFF :: Integer) gen + return $ mknode "w:abstractNum" [("w:abstractNumId",listMarkerToId marker)] + $ mknode "w:nsid" [("w:val", printf "%8x" nsid)] () + : mknode "w:multiLevelType" [("w:val","multilevel")] () + : map (mkLvl marker) [0..6] + +mkLvl :: ListMarker -> Int -> Element +mkLvl marker lvl = + mknode "w:lvl" [("w:ilvl",show lvl)] $ + [ mknode "w:start" [("w:val",start)] () + | marker /= NoMarker && marker /= BulletMarker ] ++ + [ mknode "w:numFmt" [("w:val",fmt)] () + , mknode "w:lvlText" [("w:val",lvltxt)] () + , mknode "w:lvlJc" [("w:val","left")] () + , mknode "w:pPr" [] + [ mknode "w:tabs" [] + $ mknode "w:tab" [("w:val","num"),("w:pos",show $ lvl * step)] () + , mknode "w:ind" [("w:left",show $ lvl * step + hang),("w:hanging",show hang)] () + ] + ] + where (fmt, lvltxt, start) = + case marker of + NoMarker -> ("bullet"," ","1") + BulletMarker -> ("bullet",bulletFor lvl,"1") + NumberMarker st de n -> (styleFor st lvl + ,patternFor de ("%" ++ show (lvl + 1)) + ,show n) + step = 720 + hang = 480 + bulletFor 0 = "\x2022" -- filled circle + bulletFor 1 = "\x2013" -- en dash + bulletFor 2 = "\x2022" -- hyphen bullet + bulletFor 3 = "\x2013" + bulletFor 4 = "\x2022" + bulletFor 5 = "\x2013" + bulletFor _ = "\x2022" + styleFor UpperAlpha _ = "upperLetter" + styleFor LowerAlpha _ = "lowerLetter" + styleFor UpperRoman _ = "upperRoman" + styleFor LowerRoman _ = "lowerRoman" + styleFor Decimal _ = "decimal" + styleFor DefaultStyle 1 = "decimal" + styleFor DefaultStyle 2 = "lowerLetter" + styleFor DefaultStyle 3 = "lowerRoman" + styleFor DefaultStyle 4 = "decimal" + styleFor DefaultStyle 5 = "lowerLetter" + styleFor DefaultStyle 6 = "lowerRoman" + styleFor _ _ = "decimal" + patternFor OneParen s = s ++ ")" + patternFor TwoParens s = "(" ++ s ++ ")" + patternFor _ s = s ++ "." + +getNumId :: (PandocMonad m) => WS m Int +getNumId = (((baseListId - 1) +) . length) `fmap` gets stLists + + +makeTOC :: (PandocMonad m) => WriterOptions -> WS m [Element] +makeTOC opts | writerTableOfContents opts = do + let depth = "1-"++(show (writerTOCDepth opts)) + let tocCmd = "TOC \\o \""++depth++"\" \\h \\z \\u" + tocTitle <- gets stTocTitle + title <- withParaPropM (pStyleM "TOC Heading") (blocksToOpenXML opts [Para tocTitle]) + return $ + [mknode "w:sdt" [] ([ + mknode "w:sdtPr" [] ( + mknode "w:docPartObj" [] ( + [mknode "w:docPartGallery" [("w:val","Table of Contents")] (), + mknode "w:docPartUnique" [] ()] + ) -- w:docPartObj + ), -- w:sdtPr + mknode "w:sdtContent" [] (title++[ + mknode "w:p" [] ( + mknode "w:r" [] ([ + mknode "w:fldChar" [("w:fldCharType","begin"),("w:dirty","true")] (), + mknode "w:instrText" [("xml:space","preserve")] tocCmd, + mknode "w:fldChar" [("w:fldCharType","separate")] (), + mknode "w:fldChar" [("w:fldCharType","end")] () + ]) -- w:r + ) -- w:p + ]) + ])] -- w:sdt +makeTOC _ = return [] + + +-- | Convert Pandoc document to two lists of +-- OpenXML elements (the main document and footnotes). +writeOpenXML :: (PandocMonad m) => WriterOptions -> Pandoc -> WS m ([Element], [Element]) +writeOpenXML opts (Pandoc meta blocks) = do + let tit = docTitle meta ++ case lookupMeta "subtitle" meta of + Just (MetaBlocks [Plain xs]) -> LineBreak : xs + _ -> [] + let auths = docAuthors meta + let dat = docDate meta + let abstract' = case lookupMeta "abstract" meta of + Just (MetaBlocks bs) -> bs + Just (MetaInlines ils) -> [Plain ils] + _ -> [] + let subtitle' = case lookupMeta "subtitle" meta of + Just (MetaBlocks [Plain xs]) -> xs + Just (MetaBlocks [Para xs]) -> xs + Just (MetaInlines xs) -> xs + _ -> [] + title <- withParaPropM (pStyleM "Title") $ blocksToOpenXML opts [Para tit | not (null tit)] + subtitle <- withParaPropM (pStyleM "Subtitle") $ blocksToOpenXML opts [Para subtitle' | not (null subtitle')] + authors <- withParaProp (pCustomStyle "Author") $ blocksToOpenXML opts $ + map Para auths + date <- withParaPropM (pStyleM "Date") $ blocksToOpenXML opts [Para dat | not (null dat)] + abstract <- if null abstract' + then return [] + else withParaProp (pCustomStyle "Abstract") $ blocksToOpenXML opts abstract' + let convertSpace (Str x : Space : Str y : xs) = Str (x ++ " " ++ y) : xs + convertSpace (Str x : Str y : xs) = Str (x ++ y) : xs + convertSpace xs = xs + let blocks' = bottomUp convertSpace blocks + doc' <- (setFirstPara >> blocksToOpenXML opts blocks') + notes' <- reverse `fmap` gets stFootnotes + toc <- makeTOC opts + let meta' = title ++ subtitle ++ authors ++ date ++ abstract ++ toc + return (meta' ++ doc', notes') + +-- | Convert a list of Pandoc blocks to OpenXML. +blocksToOpenXML :: (PandocMonad m) => WriterOptions -> [Block] -> WS m [Element] +blocksToOpenXML opts bls = concat `fmap` mapM (blockToOpenXML opts) bls + +pCustomStyle :: String -> Element +pCustomStyle sty = mknode "w:pStyle" [("w:val",sty)] () + +pStyleM :: (PandocMonad m) => String -> WS m XML.Element +pStyleM styleName = do + styleMaps <- gets stStyleMaps + let sty' = getStyleId styleName $ sParaStyleMap styleMaps + return $ mknode "w:pStyle" [("w:val",sty')] () + +rCustomStyle :: String -> Element +rCustomStyle sty = mknode "w:rStyle" [("w:val",sty)] () + +rStyleM :: (PandocMonad m) => String -> WS m XML.Element +rStyleM styleName = do + styleMaps <- gets stStyleMaps + let sty' = getStyleId styleName $ sCharStyleMap styleMaps + return $ mknode "w:rStyle" [("w:val",sty')] () + +getUniqueId :: (PandocMonad m) => m String +-- the + 20 is to ensure that there are no clashes with the rIds +-- already in word/document.xml.rel +getUniqueId = (show . (+ 20)) <$> P.newUniqueHash + +-- | Key for specifying user-defined docx styles. +dynamicStyleKey :: String +dynamicStyleKey = "custom-style" + +-- | Convert a Pandoc block element to OpenXML. +blockToOpenXML :: (PandocMonad m) => WriterOptions -> Block -> WS m [Element] +blockToOpenXML opts blk = withDirection $ blockToOpenXML' opts blk + +blockToOpenXML' :: (PandocMonad m) => WriterOptions -> Block -> WS m [Element] +blockToOpenXML' _ Null = return [] +blockToOpenXML' opts (Div (ident,classes,kvs) bs) + | Just sty <- lookup dynamicStyleKey kvs = do + modify $ \s -> s{stDynamicParaProps = sty : (stDynamicParaProps s)} + withParaPropM (pStyleM sty) $ blocksToOpenXML opts bs + | Just "rtl" <- lookup "dir" kvs = do + let kvs' = filter (("dir", "rtl")/=) kvs + local (\env -> env { envRTL = True }) $ + blockToOpenXML opts (Div (ident,classes,kvs') bs) + | Just "ltr" <- lookup "dir" kvs = do + let kvs' = filter (("dir", "ltr")/=) kvs + local (\env -> env { envRTL = False }) $ + blockToOpenXML opts (Div (ident,classes,kvs') bs) +blockToOpenXML' opts (Div (_,["references"],_) bs) = do + let (hs, bs') = span isHeaderBlock bs + header <- blocksToOpenXML opts hs + -- We put the Bibliography style on paragraphs after the header + rest <- withParaPropM (pStyleM "Bibliography") $ blocksToOpenXML opts bs' + return (header ++ rest) +blockToOpenXML' opts (Div _ bs) = blocksToOpenXML opts bs +blockToOpenXML' opts (Header lev (ident,_,_) lst) = do + setFirstPara + paraProps <- withParaPropM (pStyleM ("Heading "++show lev)) $ + getParaProps False + contents <- inlinesToOpenXML opts lst + usedIdents <- gets stSectionIds + let bookmarkName = if null ident + then uniqueIdent lst usedIdents + else ident + modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s } + id' <- (lift . lift) getUniqueId + let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') + ,("w:name",bookmarkName)] () + let bookmarkEnd = mknode "w:bookmarkEnd" [("w:id", id')] () + return [mknode "w:p" [] (paraProps ++ [bookmarkStart, bookmarkEnd] ++ contents)] +blockToOpenXML' opts (Plain lst) = withParaProp (pCustomStyle "Compact") + $ blockToOpenXML opts (Para lst) +-- title beginning with fig: indicates that the image is a figure +blockToOpenXML' opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = do + setFirstPara + let prop = pCustomStyle $ + if null alt + then "Figure" + else "FigureWithCaption" + paraProps <- local (\env -> env { envParaProperties = prop : envParaProperties env }) (getParaProps False) + contents <- inlinesToOpenXML opts [Image attr alt (src,tit)] + captionNode <- withParaProp (pCustomStyle "ImageCaption") + $ blockToOpenXML opts (Para alt) + return $ mknode "w:p" [] (paraProps ++ contents) : captionNode +-- fixDisplayMath sometimes produces a Para [] as artifact +blockToOpenXML' _ (Para []) = return [] +blockToOpenXML' opts (Para lst) = do + isFirstPara <- gets stFirstPara + paraProps <- getParaProps $ case lst of + [Math DisplayMath _] -> True + _ -> False + bodyTextStyle <- pStyleM "Body Text" + let paraProps' = case paraProps of + [] | isFirstPara -> [mknode "w:pPr" [] [pCustomStyle "FirstParagraph"]] + [] -> [mknode "w:pPr" [] [bodyTextStyle]] + ps -> ps + modify $ \s -> s { stFirstPara = False } + contents <- inlinesToOpenXML opts lst + return [mknode "w:p" [] (paraProps' ++ contents)] +blockToOpenXML' opts (LineBlock lns) = blockToOpenXML opts $ linesToPara lns +blockToOpenXML' _ b@(RawBlock format str) + | format == Format "openxml" = return [ x | Elem x <- parseXML str ] + | otherwise = do + report $ BlockNotRendered b + return [] +blockToOpenXML' opts (BlockQuote blocks) = do + p <- withParaPropM (pStyleM "Block Text") $ blocksToOpenXML opts blocks + setFirstPara + return p +blockToOpenXML' opts (CodeBlock attrs str) = do + p <- withParaProp (pCustomStyle "SourceCode") (blockToOpenXML opts $ Para [Code attrs str]) + setFirstPara + return p +blockToOpenXML' _ HorizontalRule = do + setFirstPara + return [ + mknode "w:p" [] $ mknode "w:r" [] $ mknode "w:pict" [] + $ mknode "v:rect" [("style","width:0;height:1.5pt"), + ("o:hralign","center"), + ("o:hrstd","t"),("o:hr","t")] () ] +blockToOpenXML' opts (Table caption aligns widths headers rows) = do + setFirstPara + let captionStr = stringify caption + caption' <- if null caption + then return [] + else withParaProp (pCustomStyle "TableCaption") + $ blockToOpenXML opts (Para caption) + let alignmentFor al = mknode "w:jc" [("w:val",alignmentToString al)] () + let cellToOpenXML (al, cell) = withParaProp (alignmentFor al) + $ blocksToOpenXML opts cell + headers' <- mapM cellToOpenXML $ zip aligns headers + rows' <- mapM (mapM cellToOpenXML . zip aligns) rows + let borderProps = mknode "w:tcPr" [] + [ mknode "w:tcBorders" [] + $ mknode "w:bottom" [("w:val","single")] () + , mknode "w:vAlign" [("w:val","bottom")] () ] + let emptyCell = [mknode "w:p" [] [mknode "w:pPr" [] [pCustomStyle "Compact"]]] + let mkcell border contents = mknode "w:tc" [] + $ [ borderProps | border ] ++ + if null contents + then emptyCell + else contents + let mkrow border cells = mknode "w:tr" [] $ + [mknode "w:trPr" [] [ + mknode "w:cnfStyle" [("w:firstRow","1")] ()] | border] + ++ map (mkcell border) cells + let textwidth = 7920 -- 5.5 in in twips, 1/20 pt + let fullrow = 5000 -- 100% specified in pct + let rowwidth = fullrow * sum widths + let mkgridcol w = mknode "w:gridCol" + [("w:w", show (floor (textwidth * w) :: Integer))] () + let hasHeader = not (all null headers) + return $ + caption' ++ + [mknode "w:tbl" [] + ( mknode "w:tblPr" [] + ( mknode "w:tblStyle" [("w:val","TableNormal")] () : + mknode "w:tblW" [("w:type", "pct"), ("w:w", show rowwidth)] () : + mknode "w:tblLook" [("w:firstRow","1") | hasHeader ] () : + [ mknode "w:tblCaption" [("w:val", captionStr)] () + | not (null caption) ] ) + : mknode "w:tblGrid" [] + (if all (==0) widths + then [] + else map mkgridcol widths) + : [ mkrow True headers' | hasHeader ] ++ + map (mkrow False) rows' + )] +blockToOpenXML' opts (BulletList lst) = do + let marker = BulletMarker + addList marker + numid <- getNumId + l <- asList $ concat `fmap` mapM (listItemToOpenXML opts numid) lst + setFirstPara + return l +blockToOpenXML' opts (OrderedList (start, numstyle, numdelim) lst) = do + let marker = NumberMarker numstyle numdelim start + addList marker + numid <- getNumId + l <- asList $ concat `fmap` mapM (listItemToOpenXML opts numid) lst + setFirstPara + return l +blockToOpenXML' opts (DefinitionList items) = do + l <- concat `fmap` mapM (definitionListItemToOpenXML opts) items + setFirstPara + return l + +definitionListItemToOpenXML :: (PandocMonad m) => WriterOptions -> ([Inline],[[Block]]) -> WS m [Element] +definitionListItemToOpenXML opts (term,defs) = do + term' <- withParaProp (pCustomStyle "DefinitionTerm") + $ blockToOpenXML opts (Para term) + defs' <- withParaProp (pCustomStyle "Definition") + $ concat `fmap` mapM (blocksToOpenXML opts) defs + return $ term' ++ defs' + +addList :: (PandocMonad m) => ListMarker -> WS m () +addList marker = do + lists <- gets stLists + modify $ \st -> st{ stLists = lists ++ [marker] } + +listItemToOpenXML :: (PandocMonad m) => WriterOptions -> Int -> [Block] -> WS m [Element] +listItemToOpenXML _ _ [] = return [] +listItemToOpenXML opts numid (first:rest) = do + first' <- withNumId numid $ blockToOpenXML opts first + -- baseListId is the code for no list marker: + rest' <- withNumId baseListId $ blocksToOpenXML opts rest + return $ first' ++ rest' + +alignmentToString :: Alignment -> [Char] +alignmentToString alignment = case alignment of + AlignLeft -> "left" + AlignRight -> "right" + AlignCenter -> "center" + AlignDefault -> "left" + +-- | Convert a list of inline elements to OpenXML. +inlinesToOpenXML :: (PandocMonad m) => WriterOptions -> [Inline] -> WS m [Element] +inlinesToOpenXML opts lst = concat `fmap` mapM (inlineToOpenXML opts) lst + +withNumId :: (PandocMonad m) => Int -> WS m a -> WS m a +withNumId numid = local $ \env -> env{ envListNumId = numid } + +asList :: (PandocMonad m) => WS m a -> WS m a +asList = local $ \env -> env{ envListLevel = envListLevel env + 1 } + +getTextProps :: (PandocMonad m) => WS m [Element] +getTextProps = do + props <- asks envTextProperties + return $ if null props + then [] + else [mknode "w:rPr" [] props] + +withTextProp :: PandocMonad m => Element -> WS m a -> WS m a +withTextProp d p = + local (\env -> env {envTextProperties = d : envTextProperties env}) p + +withTextPropM :: PandocMonad m => WS m Element -> WS m a -> WS m a +withTextPropM = (. flip withTextProp) . (>>=) + +getParaProps :: PandocMonad m => Bool -> WS m [Element] +getParaProps displayMathPara = do + props <- asks envParaProperties + listLevel <- asks envListLevel + numid <- asks envListNumId + let listPr = if listLevel >= 0 && not displayMathPara + then [ mknode "w:numPr" [] + [ mknode "w:numId" [("w:val",show numid)] () + , mknode "w:ilvl" [("w:val",show listLevel)] () ] + ] + else [] + return $ case props ++ listPr of + [] -> [] + ps -> [mknode "w:pPr" [] ps] + +withParaProp :: PandocMonad m => Element -> WS m a -> WS m a +withParaProp d p = + local (\env -> env {envParaProperties = d : envParaProperties env}) p + +withParaPropM :: PandocMonad m => WS m Element -> WS m a -> WS m a +withParaPropM = (. flip withParaProp) . (>>=) + +formattedString :: PandocMonad m => String -> WS m [Element] +formattedString str = do + props <- getTextProps + inDel <- asks envInDel + return [ mknode "w:r" [] $ + props ++ + [ mknode (if inDel then "w:delText" else "w:t") + [("xml:space","preserve")] (stripInvalidChars str) ] ] + +setFirstPara :: PandocMonad m => WS m () +setFirstPara = modify $ \s -> s { stFirstPara = True } + +-- | Convert an inline element to OpenXML. +inlineToOpenXML :: PandocMonad m => WriterOptions -> Inline -> WS m [Element] +inlineToOpenXML opts il = withDirection $ inlineToOpenXML' opts il + +inlineToOpenXML' :: PandocMonad m => WriterOptions -> Inline -> WS m [Element] +inlineToOpenXML' _ (Str str) = formattedString str +inlineToOpenXML' opts Space = inlineToOpenXML opts (Str " ") +inlineToOpenXML' opts SoftBreak = inlineToOpenXML opts (Str " ") +inlineToOpenXML' opts (Span (ident,classes,kvs) ils) + | Just sty <- lookup dynamicStyleKey kvs = do + let kvs' = filter ((dynamicStyleKey, sty)/=) kvs + modify $ \s -> s{stDynamicTextProps = sty : (stDynamicTextProps s)} + withTextProp (rCustomStyle sty) $ + inlineToOpenXML opts (Span (ident,classes,kvs') ils) + | Just "rtl" <- lookup "dir" kvs = do + let kvs' = filter (("dir", "rtl")/=) kvs + local (\env -> env { envRTL = True }) $ + inlineToOpenXML opts (Span (ident,classes,kvs') ils) + | Just "ltr" <- lookup "dir" kvs = do + let kvs' = filter (("dir", "ltr")/=) kvs + local (\env -> env { envRTL = False }) $ + inlineToOpenXML opts (Span (ident,classes,kvs') ils) + | "insertion" `elem` classes = do + defaultAuthor <- asks envChangesAuthor + defaultDate <- asks envChangesDate + let author = fromMaybe defaultAuthor (lookup "author" kvs) + date = fromMaybe defaultDate (lookup "date" kvs) + insId <- gets stInsId + modify $ \s -> s{stInsId = (insId + 1)} + x <- inlinesToOpenXML opts ils + return [ mknode "w:ins" [("w:id", (show insId)), + ("w:author", author), + ("w:date", date)] + x ] + | "deletion" `elem` classes = do + defaultAuthor <- asks envChangesAuthor + defaultDate <- asks envChangesDate + let author = fromMaybe defaultAuthor (lookup "author" kvs) + date = fromMaybe defaultDate (lookup "date" kvs) + delId <- gets stDelId + modify $ \s -> s{stDelId = (delId + 1)} + x <- local (\env -> env {envInDel = True}) (inlinesToOpenXML opts ils) + return [ mknode "w:del" [("w:id", (show delId)), + ("w:author", author), + ("w:date", date)] + x ] + | otherwise = do + let off x = withTextProp (mknode x [("w:val","0")] ()) + ((if "csl-no-emph" `elem` classes then off "w:i" else id) . + (if "csl-no-strong" `elem` classes then off "w:b" else id) . + (if "csl-no-smallcaps" `elem` classes then off "w:smallCaps" else id)) + $ inlinesToOpenXML opts ils +inlineToOpenXML' opts (Strong lst) = + withTextProp (mknode "w:b" [] ()) $ inlinesToOpenXML opts lst +inlineToOpenXML' opts (Emph lst) = + withTextProp (mknode "w:i" [] ()) $ inlinesToOpenXML opts lst +inlineToOpenXML' opts (Subscript lst) = + withTextProp (mknode "w:vertAlign" [("w:val","subscript")] ()) + $ inlinesToOpenXML opts lst +inlineToOpenXML' opts (Superscript lst) = + withTextProp (mknode "w:vertAlign" [("w:val","superscript")] ()) + $ inlinesToOpenXML opts lst +inlineToOpenXML' opts (SmallCaps lst) = + withTextProp (mknode "w:smallCaps" [] ()) + $ inlinesToOpenXML opts lst +inlineToOpenXML' opts (Strikeout lst) = + withTextProp (mknode "w:strike" [] ()) + $ inlinesToOpenXML opts lst +inlineToOpenXML' _ LineBreak = return [br] +inlineToOpenXML' _ il@(RawInline f str) + | f == Format "openxml" = return [ x | Elem x <- parseXML str ] + | otherwise = do + report $ InlineNotRendered il + return [] +inlineToOpenXML' opts (Quoted quoteType lst) = + inlinesToOpenXML opts $ [Str open] ++ lst ++ [Str close] + where (open, close) = case quoteType of + SingleQuote -> ("\x2018", "\x2019") + DoubleQuote -> ("\x201C", "\x201D") +inlineToOpenXML' opts (Math mathType str) = do + when (mathType == DisplayMath) setFirstPara + res <- (lift . lift) (convertMath writeOMML mathType str) + case res of + Right r -> return [r] + Left il -> inlineToOpenXML' opts il +inlineToOpenXML' opts (Cite _ lst) = inlinesToOpenXML opts lst +inlineToOpenXML' opts (Code attrs str) = do + let unhighlighted = intercalate [br] `fmap` + (mapM formattedString $ lines str) + formatOpenXML _fmtOpts = intercalate [br] . map (map toHlTok) + toHlTok (toktype,tok) = mknode "w:r" [] + [ mknode "w:rPr" [] + [ rCustomStyle (show toktype) ] + , mknode "w:t" [("xml:space","preserve")] (T.unpack tok) ] + withTextProp (rCustomStyle "VerbatimChar") + $ case writerHighlightStyle opts >> highlight formatOpenXML attrs str of + Just h -> return h + Nothing -> unhighlighted +inlineToOpenXML' opts (Note bs) = do + notes <- gets stFootnotes + notenum <- (lift . lift) getUniqueId + footnoteStyle <- rStyleM "Footnote Reference" + let notemarker = mknode "w:r" [] + [ mknode "w:rPr" [] footnoteStyle + , mknode "w:footnoteRef" [] () ] + let notemarkerXml = RawInline (Format "openxml") $ ppElement notemarker + let insertNoteRef (Plain ils : xs) = Plain (notemarkerXml : Space : ils) : xs + insertNoteRef (Para ils : xs) = Para (notemarkerXml : Space : ils) : xs + insertNoteRef xs = Para [notemarkerXml] : xs + + contents <- local (\env -> env{ envListLevel = -1 + , envParaProperties = [] + , envTextProperties = [] }) + (withParaPropM (pStyleM "Footnote Text") $ blocksToOpenXML opts + $ insertNoteRef bs) + let newnote = mknode "w:footnote" [("w:id", notenum)] $ contents + modify $ \s -> s{ stFootnotes = newnote : notes } + return [ mknode "w:r" [] + [ mknode "w:rPr" [] footnoteStyle + , mknode "w:footnoteReference" [("w:id", notenum)] () ] ] +-- internal link: +inlineToOpenXML' opts (Link _ txt ('#':xs,_)) = do + contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt + return [ mknode "w:hyperlink" [("w:anchor",xs)] contents ] +-- external link: +inlineToOpenXML' opts (Link _ txt (src,_)) = do + contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt + extlinks <- gets stExternalLinks + id' <- case M.lookup src extlinks of + Just i -> return i + Nothing -> do + i <- ("rId"++) `fmap` ((lift . lift) getUniqueId) + modify $ \st -> st{ stExternalLinks = + M.insert src i extlinks } + return i + return [ mknode "w:hyperlink" [("r:id",id')] contents ] +inlineToOpenXML' opts (Image attr alt (src, title)) = do + -- first, check to see if we've already done this image + pageWidth <- asks envPrintWidth + imgs <- gets stImages + case M.lookup src imgs of + Just (_,_,_,elt,_) -> return [elt] + Nothing -> do + res <- runExceptT $ lift (P.fetchItem (writerSourceURL opts) src) + case res of + Left (_ :: PandocError) -> do + report $ CouldNotFetchResource src "" + -- emit alt text + inlinesToOpenXML opts alt + Right (img, mt) -> do + ident <- ("rId"++) `fmap` ((lift . lift) getUniqueId) + let (xpt,ypt) = desiredSizeInPoints opts attr + (either (const def) id (imageSize img)) + -- 12700 emu = 1 pt + let (xemu,yemu) = fitToPage (xpt * 12700, ypt * 12700) (pageWidth * 12700) + let cNvPicPr = mknode "pic:cNvPicPr" [] $ + mknode "a:picLocks" [("noChangeArrowheads","1"),("noChangeAspect","1")] () + let nvPicPr = mknode "pic:nvPicPr" [] + [ mknode "pic:cNvPr" + [("descr",src),("id","0"),("name","Picture")] () + , cNvPicPr ] + let blipFill = mknode "pic:blipFill" [] + [ mknode "a:blip" [("r:embed",ident)] () + , mknode "a:stretch" [] $ mknode "a:fillRect" [] () ] + let xfrm = mknode "a:xfrm" [] + [ mknode "a:off" [("x","0"),("y","0")] () + , mknode "a:ext" [("cx",show xemu),("cy",show yemu)] () ] + let prstGeom = mknode "a:prstGeom" [("prst","rect")] $ + mknode "a:avLst" [] () + let ln = mknode "a:ln" [("w","9525")] + [ mknode "a:noFill" [] () + , mknode "a:headEnd" [] () + , mknode "a:tailEnd" [] () ] + let spPr = mknode "pic:spPr" [("bwMode","auto")] + [xfrm, prstGeom, mknode "a:noFill" [] (), ln] + let graphic = mknode "a:graphic" [] $ + mknode "a:graphicData" [("uri","http://schemas.openxmlformats.org/drawingml/2006/picture")] + [ mknode "pic:pic" [] + [ nvPicPr + , blipFill + , spPr ] ] + let imgElt = mknode "w:r" [] $ + mknode "w:drawing" [] $ + mknode "wp:inline" [] + [ mknode "wp:extent" [("cx",show xemu),("cy",show yemu)] () + , mknode "wp:effectExtent" [("b","0"),("l","0"),("r","0"),("t","0")] () + , mknode "wp:docPr" [("descr",stringify alt), ("title", title), ("id","1"),("name","Picture")] () + , graphic ] + let imgext = case mt >>= extensionFromMimeType of + Just x -> '.':x + Nothing -> case imageType img of + Just Png -> ".png" + Just Jpeg -> ".jpeg" + Just Gif -> ".gif" + Just Pdf -> ".pdf" + Just Eps -> ".eps" + Nothing -> "" + if null imgext + then -- without an extension there is no rule for content type + inlinesToOpenXML opts alt -- return alt to avoid corrupted docx + else do + let imgpath = "media/" ++ ident ++ imgext + let mbMimeType = mt <|> getMimeType imgpath + -- insert mime type to use in constructing [Content_Types].xml + modify $ \st -> st{ stImages = + M.insert src (ident, imgpath, mbMimeType, imgElt, img) + $ stImages st } + return [imgElt] + +br :: Element +br = breakElement "textWrapping" + +breakElement :: String -> Element +breakElement kind = mknode "w:r" [] [mknode "w:br" [("w:type",kind)] () ] + +-- Word will insert these footnotes into the settings.xml file +-- (whether or not they're visible in the document). If they're in the +-- file, but not in the footnotes.xml file, it will produce +-- problems. So we want to make sure we insert them into our document. +defaultFootnotes :: [Element] +defaultFootnotes = [ mknode "w:footnote" + [("w:type", "separator"), ("w:id", "-1")] $ + [ mknode "w:p" [] $ + [mknode "w:r" [] $ + [ mknode "w:separator" [] ()]]] + , mknode "w:footnote" + [("w:type", "continuationSeparator"), ("w:id", "0")] $ + [ mknode "w:p" [] $ + [ mknode "w:r" [] $ + [ mknode "w:continuationSeparator" [] ()]]]] + +parseXml :: (PandocMonad m) => Archive -> Archive -> String -> m Element +parseXml refArchive distArchive relpath = + case findEntryByPath relpath refArchive `mplus` + findEntryByPath relpath distArchive of + Nothing -> fail $ relpath ++ " missing in reference docx" + Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of + Nothing -> fail $ relpath ++ " corrupt in reference docx" + Just d -> return d + +-- | Scales the image to fit the page +-- sizes are passed in emu +fitToPage :: (Double, Double) -> Integer -> (Integer, Integer) +fitToPage (x, y) pageWidth + -- Fixes width to the page width and scales the height + | x > fromIntegral pageWidth = + (pageWidth, floor $ ((fromIntegral pageWidth) / x) * y) + | otherwise = (floor x, floor y) + +withDirection :: PandocMonad m => WS m a -> WS m a +withDirection x = do + isRTL <- asks envRTL + paraProps <- asks envParaProperties + textProps <- asks envTextProperties + -- We want to clean all bidirection (bidi) and right-to-left (rtl) + -- properties from the props first. This is because we don't want + -- them to stack up. + let paraProps' = filter (\e -> (qName . elName) e /= "bidi") paraProps + textProps' = filter (\e -> (qName . elName) e /= "rtl") textProps + if isRTL + -- if we are going right-to-left, we (re?)add the properties. + then flip local x $ + \env -> env { envParaProperties = (mknode "w:bidi" [] ()) : paraProps' + , envTextProperties = (mknode "w:rtl" [] ()) : textProps' + } + else flip local x $ \env -> env { envParaProperties = paraProps' + , envTextProperties = textProps' + } diff --git a/src/Text/Pandoc/Writers/DokuWiki.hs b/src/Text/Pandoc/Writers/DokuWiki.hs new file mode 100644 index 000000000..79a371d4d --- /dev/null +++ b/src/Text/Pandoc/Writers/DokuWiki.hs @@ -0,0 +1,522 @@ +{- +Copyright (C) 2008-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.DokuWiki + Copyright : Copyright (C) 2008-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Clare Macrae <clare.macrae@googlemail.com> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to DokuWiki markup. + +DokuWiki: <https://www.dokuwiki.org/dokuwiki> +-} + +{- + [ ] Implement nested blockquotes (currently only ever does one level) + [x] Implement alignment of text in tables + [ ] Implement comments + [ ] Work through the Dokuwiki spec, and check I've not missed anything out + [ ] Remove dud/duplicate code +-} + +module Text.Pandoc.Writers.DokuWiki ( writeDokuWiki ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options ( WriterOptions( + writerTableOfContents + , writerTemplate + , writerWrapText), WrapOption(..) ) +import Text.Pandoc.Shared ( escapeURI, linesToPara, removeFormatting + , camelCaseToHyphenated, trimr, substitute ) +import Text.Pandoc.Writers.Shared ( defField, metaToJSON ) +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates ( renderTemplate' ) +import Data.List ( intersect, intercalate, isPrefixOf, transpose ) +import Data.Default (Default(..)) +import Network.URI ( isURI ) +import Control.Monad ( zipWithM ) +import Control.Monad.State ( modify, State, get, evalState ) +import Control.Monad.Reader ( ReaderT, runReaderT, ask, local ) +import Text.Pandoc.Class (PandocMonad) + +data WriterState = WriterState { + stNotes :: Bool -- True if there are notes + } + +data WriterEnvironment = WriterEnvironment { + stIndent :: String -- Indent after the marker at the beginning of list items + , stUseTags :: Bool -- True if we should use HTML tags because we're in a complex list + , stBackSlashLB :: Bool -- True if we should produce formatted strings with newlines (as in a table cell) + } + +instance Default WriterState where + def = WriterState { stNotes = False } + +instance Default WriterEnvironment where + def = WriterEnvironment { stIndent = "" + , stUseTags = False + , stBackSlashLB = False } + +type DokuWiki = ReaderT WriterEnvironment (State WriterState) + +-- | Convert Pandoc to DokuWiki. +writeDokuWiki :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeDokuWiki opts document = return $ + runDokuWiki (pandocToDokuWiki opts document) + +runDokuWiki :: DokuWiki a -> a +runDokuWiki = flip evalState def . flip runReaderT def + +-- | Return DokuWiki representation of document. +pandocToDokuWiki :: WriterOptions -> Pandoc -> DokuWiki String +pandocToDokuWiki opts (Pandoc meta blocks) = do + metadata <- metaToJSON opts + (fmap trimr . blockListToDokuWiki opts) + (inlineListToDokuWiki opts) + meta + body <- blockListToDokuWiki opts blocks + notesExist <- stNotes <$> get + let notes = if notesExist + then "" -- TODO Was "\n<references />" Check whether I can really remove this: + -- if it is definitely to do with footnotes, can remove this whole bit + else "" + let main = body ++ notes + let context = defField "body" main + $ defField "toc" (writerTableOfContents opts) + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Escape special characters for DokuWiki. +escapeString :: String -> String +escapeString = substitute "__" "%%__%%" . + substitute "**" "%%**%%" . + substitute "//" "%%//%%" + +-- | Convert Pandoc block element to DokuWiki. +blockToDokuWiki :: WriterOptions -- ^ Options + -> Block -- ^ Block element + -> DokuWiki String + +blockToDokuWiki _ Null = return "" + +blockToDokuWiki opts (Div _attrs bs) = do + contents <- blockListToDokuWiki opts bs + return $ contents ++ "\n" + +blockToDokuWiki opts (Plain inlines) = + inlineListToDokuWiki opts inlines + +-- title beginning with fig: indicates that the image is a figure +-- dokuwiki doesn't support captions - so combine together alt and caption into alt +blockToDokuWiki opts (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return "" + else (" " ++) `fmap` inlineListToDokuWiki opts txt + let opt = if null txt + then "" + else "|" ++ if null tit then capt else tit ++ capt + -- Relative links fail isURI and receive a colon + prefix = if isURI src then "" else ":" + return $ "{{" ++ prefix ++ src ++ imageDims opts attr ++ opt ++ "}}\n" + +blockToDokuWiki opts (Para inlines) = do + indent <- stIndent <$> ask + useTags <- stUseTags <$> ask + contents <- inlineListToDokuWiki opts inlines + return $ if useTags + then "<HTML><p></HTML>" ++ contents ++ "<HTML></p></HTML>" + else contents ++ if null indent then "\n" else "" + +blockToDokuWiki opts (LineBlock lns) = + blockToDokuWiki opts $ linesToPara lns + +blockToDokuWiki _ (RawBlock f str) + | f == Format "dokuwiki" = return str + -- See https://www.dokuwiki.org/wiki:syntax + -- use uppercase HTML tag for block-level content: + | f == Format "html" = return $ "<HTML>\n" ++ str ++ "\n</HTML>" + | otherwise = return "" + +blockToDokuWiki _ HorizontalRule = return "\n----\n" + +blockToDokuWiki opts (Header level _ inlines) = do + -- emphasis, links etc. not allowed in headers, apparently, + -- so we remove formatting: + contents <- inlineListToDokuWiki opts $ removeFormatting inlines + let eqs = replicate ( 7 - level ) '=' + return $ eqs ++ " " ++ contents ++ " " ++ eqs ++ "\n" + +blockToDokuWiki _ (CodeBlock (_,classes,_) str) = do + let at = classes `intersect` ["actionscript", "ada", "apache", "applescript", "asm", "asp", + "autoit", "bash", "blitzbasic", "bnf", "c", "c_mac", "caddcl", "cadlisp", "cfdg", "cfm", + "cpp", "cpp-qt", "csharp", "css", "d", "delphi", "diff", "div", "dos", "eiffel", "fortran", + "freebasic", "gml", "groovy", "html4strict", "idl", "ini", "inno", "io", "java", "java5", + "javascript", "latex", "lisp", "lua", "matlab", "mirc", "mpasm", "mysql", "nsis", "objc", + "ocaml", "ocaml-brief", "oobas", "oracle8", "pascal", "perl", "php", "php-brief", "plsql", + "python", "qbasic", "rails", "reg", "robots", "ruby", "sas", "scheme", "sdlbasic", + "smalltalk", "smarty", "sql", "tcl", "", "thinbasic", "tsql", "vb", "vbnet", "vhdl", + "visualfoxpro", "winbatch", "xml", "xpp", "z80"] + return $ "<code" ++ + (case at of + [] -> ">\n" + (x:_) -> " " ++ x ++ ">\n") ++ str ++ "\n</code>" + +blockToDokuWiki opts (BlockQuote blocks) = do + contents <- blockListToDokuWiki opts blocks + if isSimpleBlockQuote blocks + then return $ unlines $ map ("> " ++) $ lines contents + else return $ "<HTML><blockquote>\n" ++ contents ++ "</blockquote></HTML>" + +blockToDokuWiki opts (Table capt aligns _ headers rows) = do + captionDoc <- if null capt + then return "" + else do + c <- inlineListToDokuWiki opts capt + return $ "" ++ c ++ "\n" + headers' <- if all null headers + then return [] + else zipWithM (tableItemToDokuWiki opts) aligns headers + rows' <- mapM (zipWithM (tableItemToDokuWiki opts) aligns) rows + let widths = map (maximum . map length) $ transpose (headers':rows') + let padTo (width, al) s = + case (width - length s) of + x | x > 0 -> + if al == AlignLeft || al == AlignDefault + then s ++ replicate x ' ' + else if al == AlignRight + then replicate x ' ' ++ s + else replicate (x `div` 2) ' ' ++ + s ++ replicate (x - x `div` 2) ' ' + | otherwise -> s + let renderRow sep cells = sep ++ + intercalate sep (zipWith padTo (zip widths aligns) cells) ++ sep + return $ captionDoc ++ + (if null headers' then "" else renderRow "^" headers' ++ "\n") ++ + unlines (map (renderRow "|") rows') + +blockToDokuWiki opts x@(BulletList items) = do + oldUseTags <- stUseTags <$> ask + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + contents <- local (\s -> s { stUseTags = True }) + (mapM (listItemToDokuWiki opts) items) + return $ "<HTML><ul></HTML>\n" ++ vcat contents ++ "<HTML></ul></HTML>\n" + else do + contents <- local (\s -> s { stIndent = stIndent s ++ " " + , stBackSlashLB = backSlash}) + (mapM (listItemToDokuWiki opts) items) + return $ vcat contents ++ if null indent then "\n" else "" + +blockToDokuWiki opts x@(OrderedList attribs items) = do + oldUseTags <- stUseTags <$> ask + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + contents <- local (\s -> s { stUseTags = True }) + (mapM (orderedListItemToDokuWiki opts) items) + return $ "<HTML><ol" ++ listAttribsToString attribs ++ "></HTML>\n" ++ vcat contents ++ "<HTML></ol></HTML>\n" + else do + contents <- local (\s -> s { stIndent = stIndent s ++ " " + , stBackSlashLB = backSlash}) + (mapM (orderedListItemToDokuWiki opts) items) + return $ vcat contents ++ if null indent then "\n" else "" + +-- TODO Need to decide how to make definition lists work on dokuwiki - I don't think there +-- is a specific representation of them. +-- TODO This creates double '; ; ' if there is a bullet or ordered list inside a definition list +blockToDokuWiki opts x@(DefinitionList items) = do + oldUseTags <- stUseTags <$> ask + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + contents <- local (\s -> s { stUseTags = True }) + (mapM (definitionListItemToDokuWiki opts) items) + return $ "<HTML><dl></HTML>\n" ++ vcat contents ++ "<HTML></dl></HTML>\n" + else do + contents <- local (\s -> s { stIndent = stIndent s ++ " " + , stBackSlashLB = backSlash}) + (mapM (definitionListItemToDokuWiki opts) items) + return $ vcat contents ++ if null indent then "\n" else "" + +-- Auxiliary functions for lists: + +-- | Convert ordered list attributes to HTML attribute string +listAttribsToString :: ListAttributes -> String +listAttribsToString (startnum, numstyle, _) = + let numstyle' = camelCaseToHyphenated $ show numstyle + in (if startnum /= 1 + then " start=\"" ++ show startnum ++ "\"" + else "") ++ + (if numstyle /= DefaultStyle + then " style=\"list-style-type: " ++ numstyle' ++ ";\"" + else "") + +-- | Convert bullet list item (list of blocks) to DokuWiki. +listItemToDokuWiki :: WriterOptions -> [Block] -> DokuWiki String +listItemToDokuWiki opts items = do + contents <- blockListToDokuWiki opts items + useTags <- stUseTags <$> ask + if useTags + then return $ "<HTML><li></HTML>" ++ contents ++ "<HTML></li></HTML>" + else do + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let indent' = if backSlash then (drop 2 indent) else indent + return $ indent' ++ "* " ++ contents + +-- | Convert ordered list item (list of blocks) to DokuWiki. +-- | TODO Emiminate dreadful duplication of text from listItemToDokuWiki +orderedListItemToDokuWiki :: WriterOptions -> [Block] -> DokuWiki String +orderedListItemToDokuWiki opts items = do + contents <- blockListToDokuWiki opts items + useTags <- stUseTags <$> ask + if useTags + then return $ "<HTML><li></HTML>" ++ contents ++ "<HTML></li></HTML>" + else do + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let indent' = if backSlash then (drop 2 indent) else indent + return $ indent' ++ "- " ++ contents + +-- | Convert definition list item (label, list of blocks) to DokuWiki. +definitionListItemToDokuWiki :: WriterOptions + -> ([Inline],[[Block]]) + -> DokuWiki String +definitionListItemToDokuWiki opts (label, items) = do + labelText <- inlineListToDokuWiki opts label + contents <- mapM (blockListToDokuWiki opts) items + useTags <- stUseTags <$> ask + if useTags + then return $ "<HTML><dt></HTML>" ++ labelText ++ "<HTML></dt></HTML>\n" ++ + (intercalate "\n" $ map (\d -> "<HTML><dd></HTML>" ++ d ++ "<HTML></dd></HTML>") contents) + else do + indent <- stIndent <$> ask + backSlash <- stBackSlashLB <$> ask + let indent' = if backSlash then (drop 2 indent) else indent + return $ indent' ++ "* **" ++ labelText ++ "** " ++ concat contents + +-- | True if the list can be handled by simple wiki markup, False if HTML tags will be needed. +isSimpleList :: Block -> Bool +isSimpleList x = + case x of + BulletList items -> all isSimpleListItem items + OrderedList (num, sty, _) items -> all isSimpleListItem items && + num == 1 && sty `elem` [DefaultStyle, Decimal] + DefinitionList items -> all isSimpleListItem $ concatMap snd items + _ -> False + +-- | True if list item can be handled with the simple wiki syntax. False if +-- HTML tags will be needed. +isSimpleListItem :: [Block] -> Bool +isSimpleListItem [] = True +isSimpleListItem [x] = + case x of + Plain _ -> True + Para _ -> True + BulletList _ -> isSimpleList x + OrderedList _ _ -> isSimpleList x + DefinitionList _ -> isSimpleList x + _ -> False +isSimpleListItem [x, y] | isPlainOrPara x = + case y of + BulletList _ -> isSimpleList y + OrderedList _ _ -> isSimpleList y + DefinitionList _ -> isSimpleList y + _ -> False +isSimpleListItem _ = False + +isPlainOrPara :: Block -> Bool +isPlainOrPara (Plain _) = True +isPlainOrPara (Para _) = True +isPlainOrPara _ = False + +isSimpleBlockQuote :: [Block] -> Bool +isSimpleBlockQuote bs = all isPlainOrPara bs + +-- | Concatenates strings with line breaks between them. +vcat :: [String] -> String +vcat = intercalate "\n" + +backSlashLineBreaks :: String -> String +backSlashLineBreaks cs = reverse $ g $ reverse $ concatMap f cs + where f '\n' = "\\\\ " + f c = [c] + g (' ' : '\\':'\\': xs) = xs + g s = s + +-- Auxiliary functions for tables: + +tableItemToDokuWiki :: WriterOptions + -> Alignment + -> [Block] + -> DokuWiki String +tableItemToDokuWiki opts align' item = do + let mkcell x = (if align' == AlignRight || align' == AlignCenter + then " " + else "") ++ x ++ + (if align' == AlignLeft || align' == AlignCenter + then " " + else "") + contents <- local (\s -> s { stBackSlashLB = True }) $ + blockListToDokuWiki opts item + return $ mkcell contents + +-- | Convert list of Pandoc block elements to DokuWiki. +blockListToDokuWiki :: WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> DokuWiki String +blockListToDokuWiki opts blocks = do + backSlash <- stBackSlashLB <$> ask + let blocks' = consolidateRawBlocks blocks + if backSlash + then (backSlashLineBreaks . vcat) <$> mapM (blockToDokuWiki opts) blocks' + else vcat <$> mapM (blockToDokuWiki opts) blocks' + +consolidateRawBlocks :: [Block] -> [Block] +consolidateRawBlocks [] = [] +consolidateRawBlocks (RawBlock f1 b1 : RawBlock f2 b2 : xs) + | f1 == f2 = consolidateRawBlocks (RawBlock f1 (b1 ++ "\n" ++ b2) : xs) +consolidateRawBlocks (x:xs) = x : consolidateRawBlocks xs + +-- | Convert list of Pandoc inline elements to DokuWiki. +inlineListToDokuWiki :: WriterOptions -> [Inline] -> DokuWiki String +inlineListToDokuWiki opts lst = + concat <$> (mapM (inlineToDokuWiki opts) lst) + +-- | Convert Pandoc inline element to DokuWiki. +inlineToDokuWiki :: WriterOptions -> Inline -> DokuWiki String + +inlineToDokuWiki opts (Span _attrs ils) = + inlineListToDokuWiki opts ils + +inlineToDokuWiki opts (Emph lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "//" ++ contents ++ "//" + +inlineToDokuWiki opts (Strong lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "**" ++ contents ++ "**" + +inlineToDokuWiki opts (Strikeout lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<del>" ++ contents ++ "</del>" + +inlineToDokuWiki opts (Superscript lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<sup>" ++ contents ++ "</sup>" + +inlineToDokuWiki opts (Subscript lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "<sub>" ++ contents ++ "</sub>" + +inlineToDokuWiki opts (SmallCaps lst) = inlineListToDokuWiki opts lst + +inlineToDokuWiki opts (Quoted SingleQuote lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "\8216" ++ contents ++ "\8217" + +inlineToDokuWiki opts (Quoted DoubleQuote lst) = do + contents <- inlineListToDokuWiki opts lst + return $ "\8220" ++ contents ++ "\8221" + +inlineToDokuWiki opts (Cite _ lst) = inlineListToDokuWiki opts lst + +inlineToDokuWiki _ (Code _ str) = + -- In dokuwiki, text surrounded by '' is really just a font statement, i.e. <tt>, + -- and so other formatting can be present inside. + -- However, in pandoc, and markdown, inlined code doesn't contain formatting. + -- So I have opted for using %% to disable all formatting inside inline code blocks. + -- This gives the best results when converting from other formats to dokuwiki, even if + -- the resultand code is a little ugly, for short strings that don't contain formatting + -- characters. + -- It does mean that if pandoc could ever read dokuwiki, and so round-trip the format, + -- any formatting inside inlined code blocks would be lost, or presented incorrectly. + return $ "''%%" ++ str ++ "%%''" + +inlineToDokuWiki _ (Str str) = return $ escapeString str + +inlineToDokuWiki _ (Math mathType str) = return $ delim ++ str ++ delim + -- note: str should NOT be escaped + where delim = case mathType of + DisplayMath -> "$$" + InlineMath -> "$" + +inlineToDokuWiki _ (RawInline f str) + | f == Format "dokuwiki" = return str + | f == Format "html" = return $ "<html>" ++ str ++ "</html>" + | otherwise = return "" + +inlineToDokuWiki _ LineBreak = return "\\\\\n" + +inlineToDokuWiki opts SoftBreak = + case writerWrapText opts of + WrapNone -> return " " + WrapAuto -> return " " + WrapPreserve -> return "\n" + +inlineToDokuWiki _ Space = return " " + +inlineToDokuWiki opts (Link _ txt (src, _)) = do + label <- inlineListToDokuWiki opts txt + case txt of + [Str s] | "mailto:" `isPrefixOf` src -> return $ "<" ++ s ++ ">" + | escapeURI s == src -> return src + _ -> if isURI src + then return $ "[[" ++ src ++ "|" ++ label ++ "]]" + else return $ "[[" ++ src' ++ "|" ++ label ++ "]]" + where src' = case src of + '/':xs -> xs -- with leading / it's a + _ -> src -- link to a help page +inlineToDokuWiki opts (Image attr alt (source, tit)) = do + alt' <- inlineListToDokuWiki opts alt + let txt = case (tit, alt) of + ("", []) -> "" + ("", _ ) -> "|" ++ alt' + (_ , _ ) -> "|" ++ tit + -- Relative links fail isURI and receive a colon + prefix = if isURI source then "" else ":" + return $ "{{" ++ prefix ++ source ++ imageDims opts attr ++ txt ++ "}}" + +inlineToDokuWiki opts (Note contents) = do + contents' <- blockListToDokuWiki opts contents + modify (\s -> s { stNotes = True }) + return $ "((" ++ contents' ++ "))" + -- note - may not work for notes with multiple blocks + +imageDims :: WriterOptions -> Attr -> String +imageDims opts attr = go (toPx $ dimension Width attr) (toPx $ dimension Height attr) + where + toPx = fmap (showInPixel opts) . checkPct + checkPct (Just (Percent _)) = Nothing + checkPct maybeDim = maybeDim + go (Just w) Nothing = "?" ++ w + go (Just w) (Just h) = "?" ++ w ++ "x" ++ h + go Nothing (Just h) = "?0x" ++ h + go Nothing Nothing = "" diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs new file mode 100644 index 000000000..247014c20 --- /dev/null +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -0,0 +1,1257 @@ +{-# LANGUAGE PatternGuards, CPP, ScopedTypeVariables, ViewPatterns, FlexibleContexts #-} +{- +Copyright (C) 2010-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.EPUB + Copyright : Copyright (C) 2010-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to EPUB. +-} +module Text.Pandoc.Writers.EPUB ( writeEPUB2, writeEPUB3 ) where +import Text.Pandoc.Logging +import qualified Data.Map as M +import qualified Data.Set as Set +import Data.Maybe ( fromMaybe, catMaybes ) +import Data.List ( isPrefixOf, isInfixOf, intercalate ) +import Text.Printf (printf) +import System.FilePath ( takeExtension, takeFileName ) +import Network.HTTP ( urlEncode ) +import qualified Data.ByteString.Lazy as B +import qualified Data.ByteString.Lazy.Char8 as B8 +import qualified Text.Pandoc.UTF8 as UTF8 +import Codec.Archive.Zip ( emptyArchive, addEntryToArchive, eRelativePath, fromEntry , Entry, toEntry, fromArchive) +import Text.Pandoc.Compat.Time +import Text.Pandoc.Shared ( renderTags', safeRead, uniqueIdent, trim + , normalizeDate, stringify + , hierarchicalize ) +import qualified Text.Pandoc.Shared as S (Element(..)) +import Text.Pandoc.Builder (fromList, setMeta) +import Text.Pandoc.Options ( WriterOptions(..) + , WrapOption(..) + , HTMLMathMethod(..) + , EPUBVersion(..) + , ObfuscationMethod(NoObfuscation) ) +import Text.Pandoc.Definition +import Text.Pandoc.Walk (walk, walkM, query) +import Text.Pandoc.UUID (getUUID) +import Control.Monad.State (modify, get, gets, State, StateT, put, evalState, evalStateT, lift) +import Control.Monad (mplus, when, zipWithM) +import Text.XML.Light ( unode, Element(..), unqual, Attr(..), add_attrs + , strContent, lookupAttr, Node(..), QName(..), parseXML + , onlyElems, node, ppElement) +import Text.Pandoc.Writers.HTML ( writeHtmlStringForEPUB ) +import Data.Char ( toLower, isDigit, isAlphaNum ) +import Text.Pandoc.MIME (MimeType, getMimeType, extensionFromMimeType) +import Text.HTML.TagSoup (Tag(TagOpen), fromAttrib, parseTags) +import Control.Monad.Except (throwError, catchError) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import qualified Text.Pandoc.Class as P + +-- A Chapter includes a list of blocks and maybe a section +-- number offset. Note, some chapters are unnumbered. The section +-- number is different from the index number, which will be used +-- in filenames, chapter0003.xhtml. +data Chapter = Chapter (Maybe [Int]) [Block] + +data EPUBState = EPUBState { + stMediaPaths :: [(FilePath, (FilePath, Maybe Entry))] + } + +type E m = StateT EPUBState m + +data EPUBMetadata = EPUBMetadata{ + epubIdentifier :: [Identifier] + , epubTitle :: [Title] + , epubDate :: [Date] + , epubLanguage :: String + , epubCreator :: [Creator] + , epubContributor :: [Creator] + , epubSubject :: [String] + , epubDescription :: Maybe String + , epubType :: Maybe String + , epubFormat :: Maybe String + , epubPublisher :: Maybe String + , epubSource :: Maybe String + , epubRelation :: Maybe String + , epubCoverage :: Maybe String + , epubRights :: Maybe String + , epubCoverImage :: Maybe String + , epubStylesheet :: Maybe Stylesheet + , epubPageDirection :: Maybe ProgressionDirection + } deriving Show + +data Stylesheet = StylesheetPath FilePath + | StylesheetContents String + deriving Show + +data Date = Date{ + dateText :: String + , dateEvent :: Maybe String + } deriving Show + +data Creator = Creator{ + creatorText :: String + , creatorRole :: Maybe String + , creatorFileAs :: Maybe String + } deriving Show + +data Identifier = Identifier{ + identifierText :: String + , identifierScheme :: Maybe String + } deriving Show + +data Title = Title{ + titleText :: String + , titleFileAs :: Maybe String + , titleType :: Maybe String + } deriving Show + +data ProgressionDirection = LTR | RTL deriving Show + +dcName :: String -> QName +dcName n = QName n Nothing (Just "dc") + +dcNode :: Node t => String -> t -> Element +dcNode = node . dcName + +opfName :: String -> QName +opfName n = QName n Nothing (Just "opf") + +toId :: FilePath -> String +toId = map (\x -> if isAlphaNum x || x == '-' || x == '_' + then x + else '_') . takeFileName + +removeNote :: Inline -> Inline +removeNote (Note _) = Str "" +removeNote x = x + +getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata +getEPUBMetadata opts meta = do + let md = metadataFromMeta opts meta + let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts + let md' = foldr addMetadataFromXML md elts + let addIdentifier m = + if null (epubIdentifier m) + then do + randomId <- (show . getUUID) <$> lift P.newStdGen + return $ m{ epubIdentifier = [Identifier randomId Nothing] } + else return m + let addLanguage m = + if null (epubLanguage m) + then case lookup "lang" (writerVariables opts) of + Just x -> return m{ epubLanguage = x } + Nothing -> do + mLang <- lift $ P.lookupEnv "LANG" + let localeLang = + case mLang of + Just lang -> + map (\c -> if c == '_' then '-' else c) $ + takeWhile (/='.') lang + Nothing -> "en-US" + return m{ epubLanguage = localeLang } + else return m + let fixDate m = + if null (epubDate m) + then do + currentTime <- lift P.getCurrentTime + return $ m{ epubDate = [ Date{ + dateText = showDateTimeISO8601 currentTime + , dateEvent = Nothing } ] } + else return m + let addAuthor m = + if any (\c -> creatorRole c == Just "aut") $ epubCreator m + then return m + else do + let authors' = map stringify $ docAuthors meta + let toAuthor name = Creator{ creatorText = name + , creatorRole = Just "aut" + , creatorFileAs = Nothing } + return $ m{ epubCreator = map toAuthor authors' ++ epubCreator m } + addIdentifier md' >>= fixDate >>= addAuthor >>= addLanguage + +addMetadataFromXML :: Element -> EPUBMetadata -> EPUBMetadata +addMetadataFromXML e@(Element (QName name _ (Just "dc")) attrs _ _) md + | name == "identifier" = md{ epubIdentifier = + Identifier{ identifierText = strContent e + , identifierScheme = lookupAttr (opfName "scheme") attrs + } : epubIdentifier md } + | name == "title" = md{ epubTitle = + Title{ titleText = strContent e + , titleFileAs = getAttr "file-as" + , titleType = getAttr "type" + } : epubTitle md } + | name == "date" = md{ epubDate = + Date{ dateText = fromMaybe "" $ normalizeDate' $ strContent e + , dateEvent = getAttr "event" + } : epubDate md } + | name == "language" = md{ epubLanguage = strContent e } + | name == "creator" = md{ epubCreator = + Creator{ creatorText = strContent e + , creatorRole = getAttr "role" + , creatorFileAs = getAttr "file-as" + } : epubCreator md } + | name == "contributor" = md{ epubContributor = + Creator { creatorText = strContent e + , creatorRole = getAttr "role" + , creatorFileAs = getAttr "file-as" + } : epubContributor md } + | name == "subject" = md{ epubSubject = strContent e : epubSubject md } + | name == "description" = md { epubDescription = Just $ strContent e } + | name == "type" = md { epubType = Just $ strContent e } + | name == "format" = md { epubFormat = Just $ strContent e } + | name == "type" = md { epubType = Just $ strContent e } + | name == "publisher" = md { epubPublisher = Just $ strContent e } + | name == "source" = md { epubSource = Just $ strContent e } + | name == "relation" = md { epubRelation = Just $ strContent e } + | name == "coverage" = md { epubCoverage = Just $ strContent e } + | name == "rights" = md { epubRights = Just $ strContent e } + | otherwise = md + where getAttr n = lookupAttr (opfName n) attrs +addMetadataFromXML _ md = md + +metaValueToString :: MetaValue -> String +metaValueToString (MetaString s) = s +metaValueToString (MetaInlines ils) = stringify ils +metaValueToString (MetaBlocks bs) = stringify bs +metaValueToString (MetaBool True) = "true" +metaValueToString (MetaBool False) = "false" +metaValueToString _ = "" + +getList :: String -> Meta -> (MetaValue -> a) -> [a] +getList s meta handleMetaValue = + case lookupMeta s meta of + Just (MetaList xs) -> map handleMetaValue xs + Just mv -> [handleMetaValue mv] + Nothing -> [] + +getIdentifier :: Meta -> [Identifier] +getIdentifier meta = getList "identifier" meta handleMetaValue + where handleMetaValue (MetaMap m) = + Identifier{ identifierText = maybe "" metaValueToString + $ M.lookup "text" m + , identifierScheme = metaValueToString <$> + M.lookup "scheme" m } + handleMetaValue mv = Identifier (metaValueToString mv) Nothing + +getTitle :: Meta -> [Title] +getTitle meta = getList "title" meta handleMetaValue + where handleMetaValue (MetaMap m) = + Title{ titleText = maybe "" metaValueToString $ M.lookup "text" m + , titleFileAs = metaValueToString <$> M.lookup "file-as" m + , titleType = metaValueToString <$> M.lookup "type" m } + handleMetaValue mv = Title (metaValueToString mv) Nothing Nothing + +getCreator :: String -> Meta -> [Creator] +getCreator s meta = getList s meta handleMetaValue + where handleMetaValue (MetaMap m) = + Creator{ creatorText = maybe "" metaValueToString $ M.lookup "text" m + , creatorFileAs = metaValueToString <$> M.lookup "file-as" m + , creatorRole = metaValueToString <$> M.lookup "role" m } + handleMetaValue mv = Creator (metaValueToString mv) Nothing Nothing + +getDate :: String -> Meta -> [Date] +getDate s meta = getList s meta handleMetaValue + where handleMetaValue (MetaMap m) = + Date{ dateText = maybe "" id $ + M.lookup "text" m >>= normalizeDate' . metaValueToString + , dateEvent = metaValueToString <$> M.lookup "event" m } + handleMetaValue mv = Date { dateText = maybe "" + id $ normalizeDate' $ metaValueToString mv + , dateEvent = Nothing } + +simpleList :: String -> Meta -> [String] +simpleList s meta = + case lookupMeta s meta of + Just (MetaList xs) -> map metaValueToString xs + Just x -> [metaValueToString x] + Nothing -> [] + +metadataFromMeta :: WriterOptions -> Meta -> EPUBMetadata +metadataFromMeta opts meta = EPUBMetadata{ + epubIdentifier = identifiers + , epubTitle = titles + , epubDate = date + , epubLanguage = language + , epubCreator = creators + , epubContributor = contributors + , epubSubject = subjects + , epubDescription = description + , epubType = epubtype + , epubFormat = format + , epubPublisher = publisher + , epubSource = source + , epubRelation = relation + , epubCoverage = coverage + , epubRights = rights + , epubCoverImage = coverImage + , epubStylesheet = stylesheet + , epubPageDirection = pageDirection + } + where identifiers = getIdentifier meta + titles = getTitle meta + date = getDate "date" meta + language = maybe "" metaValueToString $ + lookupMeta "language" meta `mplus` lookupMeta "lang" meta + creators = getCreator "creator" meta + contributors = getCreator "contributor" meta + subjects = simpleList "subject" meta + description = metaValueToString <$> lookupMeta "description" meta + epubtype = metaValueToString <$> lookupMeta "type" meta + format = metaValueToString <$> lookupMeta "format" meta + publisher = metaValueToString <$> lookupMeta "publisher" meta + source = metaValueToString <$> lookupMeta "source" meta + relation = metaValueToString <$> lookupMeta "relation" meta + coverage = metaValueToString <$> lookupMeta "coverage" meta + rights = metaValueToString <$> lookupMeta "rights" meta + coverImage = lookup "epub-cover-image" (writerVariables opts) `mplus` + (metaValueToString <$> lookupMeta "cover-image" meta) + stylesheet = (StylesheetContents <$> writerEpubStylesheet opts) `mplus` + ((StylesheetPath . metaValueToString) <$> + lookupMeta "stylesheet" meta) + pageDirection = case map toLower . metaValueToString <$> + lookupMeta "page-progression-direction" meta of + Just "ltr" -> Just LTR + Just "rtl" -> Just RTL + _ -> Nothing + +-- | Produce an EPUB2 file from a Pandoc document. +writeEPUB2 :: PandocMonad m + => WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> m B.ByteString +writeEPUB2 = writeEPUB EPUB2 + +-- | Produce an EPUB3 file from a Pandoc document. +writeEPUB3 :: PandocMonad m + => WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> m B.ByteString +writeEPUB3 = writeEPUB EPUB3 + +-- | Produce an EPUB file from a Pandoc document. +writeEPUB :: PandocMonad m + => EPUBVersion + -> WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> m B.ByteString +writeEPUB epubVersion opts doc = + let initState = EPUBState { stMediaPaths = [] + } + in + evalStateT (pandocToEPUB epubVersion opts doc) + initState + +pandocToEPUB :: PandocMonad m + => EPUBVersion + -> WriterOptions + -> Pandoc + -> E m B.ByteString +pandocToEPUB version opts doc@(Pandoc meta _) = do + let epub3 = version == EPUB3 + let writeHtml o = fmap UTF8.fromStringLazy . + writeHtmlStringForEPUB version o + epochtime <- floor <$> lift P.getPOSIXTime + let mkEntry path content = toEntry path epochtime content + let vars = ("epub3", if epub3 then "true" else "false") + : ("css", "stylesheet.css") + : writerVariables opts + let opts' = opts{ writerEmailObfuscation = NoObfuscation + , writerSectionDivs = True + , writerVariables = vars + , writerHTMLMathMethod = + if epub3 + then MathML + else writerHTMLMathMethod opts + , writerWrapText = WrapAuto } + metadata <- getEPUBMetadata opts' meta + + -- cover page + (cpgEntry, cpicEntry) <- + case epubCoverImage metadata of + Nothing -> return ([],[]) + Just img -> do + let coverImage = "media/" ++ takeFileName img + cpContent <- lift $ writeHtml + opts'{ writerVariables = ("coverpage","true"):vars } + (Pandoc meta [RawBlock (Format "html") $ "<div id=\"cover-image\">\n<img src=\"" ++ coverImage ++ "\" alt=\"cover image\" />\n</div>"]) + imgContent <- lift $ P.readFileLazy img + return ( [mkEntry "cover.xhtml" cpContent] + , [mkEntry coverImage imgContent] ) + + -- title page + tpContent <- lift $ writeHtml opts'{ + writerVariables = ("titlepage","true"):vars } + (Pandoc meta []) + let tpEntry = mkEntry "title_page.xhtml" tpContent + + -- handle pictures + -- mediaRef <- P.newIORef [] + Pandoc _ blocks <- walkM (transformInline opts') doc >>= + walkM (transformBlock opts') + picEntries <- (catMaybes . map (snd . snd)) <$> (gets stMediaPaths) + -- handle fonts + let matchingGlob f = do + xs <- lift $ P.glob f + when (null xs) $ + report $ CouldNotFetchResource f "glob did not match any font files" + return xs + let mkFontEntry f = mkEntry (takeFileName f) `fmap` (lift $ P.readFileLazy f) + fontFiles <- concat <$> mapM matchingGlob (writerEpubFonts opts') + fontEntries <- mapM mkFontEntry fontFiles + + -- set page progression direction attribution + let progressionDirection = case epubPageDirection metadata of + Just LTR | epub3 -> + [("page-progression-direction", "ltr")] + Just RTL | epub3 -> + [("page-progression-direction", "rtl")] + _ -> [] + + -- body pages + + -- add level 1 header to beginning if none there + let blocks' = addIdentifiers + $ case blocks of + (Header 1 _ _ : _) -> blocks + _ -> Header 1 ("",["unnumbered"],[]) + (docTitle' meta) : blocks + + let chapterHeaderLevel = writerEpubChapterLevel opts + + let isChapterHeader (Header n _ _) = n <= chapterHeaderLevel + isChapterHeader (Div ("",["references"],[]) (Header n _ _:_)) = + n <= chapterHeaderLevel + isChapterHeader _ = False + + let toChapters :: [Block] -> State [Int] [Chapter] + toChapters [] = return [] + toChapters (Div ("",["references"],[]) bs@(Header 1 _ _:_) : rest) = + toChapters (bs ++ rest) + toChapters (Header n attr@(_,classes,_) ils : bs) = do + nums <- get + mbnum <- if "unnumbered" `elem` classes + then return Nothing + else case splitAt (n - 1) nums of + (ks, (m:_)) -> do + let nums' = ks ++ [m+1] + put nums' + return $ Just (ks ++ [m]) + -- note, this is the offset not the sec number + (ks, []) -> do + let nums' = ks ++ [1] + put nums' + return $ Just ks + let (xs,ys) = break isChapterHeader bs + (Chapter mbnum (Header n attr ils : xs) :) `fmap` toChapters ys + toChapters (b:bs) = do + let (xs,ys) = break isChapterHeader bs + (Chapter Nothing (b:xs) :) `fmap` toChapters ys + + let chapters' = evalState (toChapters blocks') [] + + let extractLinkURL' :: Int -> Inline -> [(String, String)] + extractLinkURL' num (Span (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL' _ _ = [] + + let extractLinkURL :: Int -> Block -> [(String, String)] + extractLinkURL num (Div (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL num (Header _ (ident, _, _) _) + | not (null ident) = [(ident, showChapter num ++ ('#':ident))] + extractLinkURL num b = query (extractLinkURL' num) b + + let reftable = concat $ zipWith (\(Chapter _ bs) num -> + query (extractLinkURL num) bs) + chapters' [1..] + + let fixInternalReferences :: Inline -> Inline + fixInternalReferences (Link attr lab ('#':xs, tit)) = + case lookup xs reftable of + Just ys -> Link attr lab (ys, tit) + Nothing -> Link attr lab ('#':xs, tit) + fixInternalReferences x = x + + -- internal reference IDs change when we chunk the file, + -- so that '#my-header-1' might turn into 'chap004.xhtml#my-header'. + -- this fixes that: + let chapters = map (\(Chapter mbnum bs) -> + Chapter mbnum $ walk fixInternalReferences bs) + chapters' + + let chapToEntry num (Chapter mbnum bs) = + mkEntry (showChapter num) <$> + (writeHtml opts'{ writerNumberOffset = fromMaybe [] mbnum } + $ case bs of + (Header _ _ xs : _) -> + -- remove notes or we get doubled footnotes + Pandoc (setMeta "title" (walk removeNote $ fromList xs) + nullMeta) bs + _ -> + Pandoc nullMeta bs) + + chapterEntries <- lift $ zipWithM chapToEntry [1..] chapters + + -- incredibly inefficient (TODO): + let containsMathML ent = epub3 && + "<math" `isInfixOf` (B8.unpack $ fromEntry ent) + let containsSVG ent = epub3 && + "<svg" `isInfixOf` (B8.unpack $ fromEntry ent) + let props ent = ["mathml" | containsMathML ent] ++ ["svg" | containsSVG ent] + + -- contents.opf + let chapterNode ent = unode "item" ! + ([("id", toId $ eRelativePath ent), + ("href", eRelativePath ent), + ("media-type", "application/xhtml+xml")] + ++ case props ent of + [] -> [] + xs -> [("properties", unwords xs)]) + $ () + let chapterRefNode ent = unode "itemref" ! + [("idref", toId $ eRelativePath ent)] $ () + let pictureNode ent = unode "item" ! + [("id", toId $ eRelativePath ent), + ("href", eRelativePath ent), + ("media-type", fromMaybe "application/octet-stream" + $ mediaTypeOf $ eRelativePath ent)] $ () + let fontNode ent = unode "item" ! + [("id", toId $ eRelativePath ent), + ("href", eRelativePath ent), + ("media-type", fromMaybe "" $ getMimeType $ eRelativePath ent)] $ () + let plainTitle = case docTitle' meta of + [] -> case epubTitle metadata of + [] -> "UNTITLED" + (x:_) -> titleText x + x -> stringify x + + let tocTitle = fromMaybe plainTitle $ + metaValueToString <$> lookupMeta "toc-title" meta + uuid <- case epubIdentifier metadata of + (x:_) -> return $ identifierText x -- use first identifier as UUID + [] -> throwError $ PandocShouldNeverHappenError "epubIdentifier is null" -- shouldn't happen + currentTime <- lift $ P.getCurrentTime + let contentsData = UTF8.fromStringLazy $ ppTopElement $ + unode "package" ! [("version", case version of + EPUB2 -> "2.0" + EPUB3 -> "3.0") + ,("xmlns","http://www.idpf.org/2007/opf") + ,("unique-identifier","epub-id-1")] $ + [ metadataElement version metadata currentTime + , unode "manifest" $ + [ unode "item" ! [("id","ncx"), ("href","toc.ncx") + ,("media-type","application/x-dtbncx+xml")] $ () + , unode "item" ! [("id","style"), ("href","stylesheet.css") + ,("media-type","text/css")] $ () + , unode "item" ! ([("id","nav") + ,("href","nav.xhtml") + ,("media-type","application/xhtml+xml")] ++ + [("properties","nav") | epub3 ]) $ () + ] ++ + map chapterNode (cpgEntry ++ (tpEntry : chapterEntries)) ++ + (case cpicEntry of + [] -> [] + (x:_) -> [add_attrs + [Attr (unqual "properties") "cover-image" | epub3] + (pictureNode x)]) ++ + map pictureNode picEntries ++ + map fontNode fontEntries + , unode "spine" ! ([("toc","ncx")] ++ progressionDirection) $ + case epubCoverImage metadata of + Nothing -> [] + Just _ -> [ unode "itemref" ! + [("idref", "cover_xhtml")] $ () ] + ++ ((unode "itemref" ! [("idref", "title_page_xhtml") + ,("linear", + case lookupMeta "title" meta of + Just _ -> "yes" + Nothing -> "no")] $ ()) : + [unode "itemref" ! [("idref", "nav")] $ () + | writerTableOfContents opts ] ++ + map chapterRefNode chapterEntries) + , unode "guide" $ + [ unode "reference" ! + [("type","toc"),("title", tocTitle), + ("href","nav.xhtml")] $ () + ] ++ + [ unode "reference" ! + [("type","cover"),("title","Cover"),("href","cover.xhtml")] $ () | epubCoverImage metadata /= Nothing + ] + ] + let contentsEntry = mkEntry "content.opf" contentsData + + -- toc.ncx + let secs = hierarchicalize blocks' + + let tocLevel = writerTOCDepth opts + + let navPointNode :: PandocMonad m + => (Int -> String -> String -> [Element] -> Element) + -> S.Element -> StateT Int m Element + navPointNode formatter (S.Sec _ nums (ident,_,_) ils children) = do + n <- get + modify (+1) + let showNums :: [Int] -> String + showNums = intercalate "." . map show + let tit' = stringify ils + let tit = if writerNumberSections opts && not (null nums) + then showNums nums ++ " " ++ tit' + else tit' + src <- case lookup ident reftable of + Just x -> return x + Nothing -> throwError $ PandocSomeError $ ident ++ " not found in reftable" + let isSec (S.Sec lev _ _ _ _) = lev <= tocLevel + isSec _ = False + let subsecs = filter isSec children + subs <- mapM (navPointNode formatter) subsecs + return $ formatter n tit src subs + navPointNode _ (S.Blk _) = throwError $ PandocSomeError "navPointNode encountered Blk" + + let navMapFormatter :: Int -> String -> String -> [Element] -> Element + navMapFormatter n tit src subs = unode "navPoint" ! + [("id", "navPoint-" ++ show n)] $ + [ unode "navLabel" $ unode "text" tit + , unode "content" ! [("src", src)] $ () + ] ++ subs + + let tpNode = unode "navPoint" ! [("id", "navPoint-0")] $ + [ unode "navLabel" $ unode "text" (stringify $ docTitle' meta) + , unode "content" ! [("src","title_page.xhtml")] $ () ] + + navMap <- lift $ evalStateT (mapM (navPointNode navMapFormatter) secs) 1 + let tocData = UTF8.fromStringLazy $ ppTopElement $ + unode "ncx" ! [("version","2005-1") + ,("xmlns","http://www.daisy.org/z3986/2005/ncx/")] $ + [ unode "head" $ + [ unode "meta" ! [("name","dtb:uid") + ,("content", uuid)] $ () + , unode "meta" ! [("name","dtb:depth") + ,("content", "1")] $ () + , unode "meta" ! [("name","dtb:totalPageCount") + ,("content", "0")] $ () + , unode "meta" ! [("name","dtb:maxPageNumber") + ,("content", "0")] $ () + ] ++ case epubCoverImage metadata of + Nothing -> [] + Just img -> [unode "meta" ! [("name","cover"), + ("content", toId img)] $ ()] + , unode "docTitle" $ unode "text" $ plainTitle + , unode "navMap" $ + tpNode : navMap + ] + let tocEntry = mkEntry "toc.ncx" tocData + + let navXhtmlFormatter :: Int -> String -> String -> [Element] -> Element + navXhtmlFormatter n tit src subs = unode "li" ! + [("id", "toc-li-" ++ show n)] $ + (unode "a" ! [("href",src)] + $ tit) + : case subs of + [] -> [] + (_:_) -> [unode "ol" ! [("class","toc")] $ subs] + + let navtag = if epub3 then "nav" else "div" + tocBlocks <- lift $ evalStateT (mapM (navPointNode navXhtmlFormatter) secs) 1 + let navBlocks = [RawBlock (Format "html") $ ppElement $ + unode navtag ! ([("epub:type","toc") | epub3] ++ + [("id","toc")]) $ + [ unode "h1" ! [("id","toc-title")] $ tocTitle + , unode "ol" ! [("class","toc")] $ tocBlocks ]] + let landmarks = if epub3 + then [RawBlock (Format "html") $ ppElement $ + unode "nav" ! [("epub:type","landmarks") + ,("hidden","hidden")] $ + [ unode "ol" $ + [ unode "li" + [ unode "a" ! [("href", "cover.xhtml") + ,("epub:type", "cover")] $ + "Cover"] | + epubCoverImage metadata /= Nothing + ] ++ + [ unode "li" + [ unode "a" ! [("href", "#toc") + ,("epub:type", "toc")] $ + "Table of contents" + ] | writerTableOfContents opts + ] + ] + ] + else [] + navData <- lift $ writeHtml opts'{ writerVariables = ("navpage","true"):vars } + (Pandoc (setMeta "title" + (walk removeNote $ fromList $ docTitle' meta) nullMeta) + (navBlocks ++ landmarks)) + let navEntry = mkEntry "nav.xhtml" navData + + -- mimetype + let mimetypeEntry = mkEntry "mimetype" $ UTF8.fromStringLazy "application/epub+zip" + + -- container.xml + let containerData = UTF8.fromStringLazy $ ppTopElement $ + unode "container" ! [("version","1.0") + ,("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")] $ + unode "rootfiles" $ + unode "rootfile" ! [("full-path","content.opf") + ,("media-type","application/oebps-package+xml")] $ () + let containerEntry = mkEntry "META-INF/container.xml" containerData + + -- com.apple.ibooks.display-options.xml + let apple = UTF8.fromStringLazy $ ppTopElement $ + unode "display_options" $ + unode "platform" ! [("name","*")] $ + unode "option" ! [("name","specified-fonts")] $ "true" + let appleEntry = mkEntry "META-INF/com.apple.ibooks.display-options.xml" apple + + -- stylesheet + stylesheet <- case epubStylesheet metadata of + Just (StylesheetPath fp) -> UTF8.toStringLazy <$> (lift $ P.readFileLazy fp) + Just (StylesheetContents s) -> return s + Nothing -> UTF8.toString `fmap` + (lift $ P.readDataFile (writerUserDataDir opts) "epub.css") + let stylesheetEntry = mkEntry "stylesheet.css" $ UTF8.fromStringLazy stylesheet + + -- construct archive + let archive = foldr addEntryToArchive emptyArchive + (mimetypeEntry : containerEntry : appleEntry : stylesheetEntry : tpEntry : + contentsEntry : tocEntry : navEntry : + (picEntries ++ cpicEntry ++ cpgEntry ++ chapterEntries ++ fontEntries)) + return $ fromArchive archive + +metadataElement :: EPUBVersion -> EPUBMetadata -> UTCTime -> Element +metadataElement version md currentTime = + unode "metadata" ! [("xmlns:dc","http://purl.org/dc/elements/1.1/") + ,("xmlns:opf","http://www.idpf.org/2007/opf")] $ mdNodes + where mdNodes = identifierNodes ++ titleNodes ++ dateNodes ++ languageNodes + ++ creatorNodes ++ contributorNodes ++ subjectNodes + ++ descriptionNodes ++ typeNodes ++ formatNodes + ++ publisherNodes ++ sourceNodes ++ relationNodes + ++ coverageNodes ++ rightsNodes ++ coverImageNodes + ++ modifiedNodes + withIds base f = concat . zipWith f (map (\x -> base ++ ('-' : show x)) + ([1..] :: [Int])) + identifierNodes = withIds "epub-id" toIdentifierNode $ + epubIdentifier md + titleNodes = withIds "epub-title" toTitleNode $ epubTitle md + dateNodes = if version == EPUB2 + then withIds "epub-date" toDateNode $ epubDate md + else -- epub3 allows only one dc:date + -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-opf-dcdate + case epubDate md of + [] -> [] + (x:_) -> [dcNode "date" ! [("id","epub-date")] + $ dateText x] + languageNodes = [dcTag "language" $ epubLanguage md] + creatorNodes = withIds "epub-creator" (toCreatorNode "creator") $ + epubCreator md + contributorNodes = withIds "epub-contributor" + (toCreatorNode "contributor") $ epubContributor md + subjectNodes = map (dcTag "subject") $ epubSubject md + descriptionNodes = maybe [] (dcTag' "description") $ epubDescription md + typeNodes = maybe [] (dcTag' "type") $ epubType md + formatNodes = maybe [] (dcTag' "format") $ epubFormat md + publisherNodes = maybe [] (dcTag' "publisher") $ epubPublisher md + sourceNodes = maybe [] (dcTag' "source") $ epubSource md + relationNodes = maybe [] (dcTag' "relation") $ epubRelation md + coverageNodes = maybe [] (dcTag' "coverage") $ epubCoverage md + rightsNodes = maybe [] (dcTag' "rights") $ epubRights md + coverImageNodes = maybe [] + (\img -> [unode "meta" ! [("name","cover"), + ("content",toId img)] $ ()]) + $ epubCoverImage md + modifiedNodes = [ unode "meta" ! [("property", "dcterms:modified")] $ + (showDateTimeISO8601 currentTime) | version == EPUB3 ] + dcTag n s = unode ("dc:" ++ n) s + dcTag' n s = [dcTag n s] + toIdentifierNode id' (Identifier txt scheme) + | version == EPUB2 = [dcNode "identifier" ! + ([("id",id')] ++ maybe [] (\x -> [("opf:scheme", x)]) scheme) $ + txt] + | otherwise = [dcNode "identifier" ! [("id",id')] $ txt] ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","identifier-type"), + ("scheme","onix:codelist5")] $ x]) + (schemeToOnix `fmap` scheme) + toCreatorNode s id' creator + | version == EPUB2 = [dcNode s ! + (("id",id') : + maybe [] (\x -> [("opf:file-as",x)]) (creatorFileAs creator) ++ + maybe [] (\x -> [("opf:role",x)]) + (creatorRole creator >>= toRelator)) $ creatorText creator] + | otherwise = [dcNode s ! [("id",id')] $ creatorText creator] ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","file-as")] $ x]) + (creatorFileAs creator) ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","role"), + ("scheme","marc:relators")] $ x]) + (creatorRole creator >>= toRelator) + toTitleNode id' title + | version == EPUB2 = [dcNode "title" ! + (("id",id') : + -- note: EPUB2 doesn't accept opf:title-type + maybe [] (\x -> [("opf:file-as",x)]) (titleFileAs title)) $ + titleText title] + | otherwise = [dcNode "title" ! [("id",id')] $ titleText title] + ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","file-as")] $ x]) + (titleFileAs title) ++ + maybe [] (\x -> [unode "meta" ! + [("refines",'#':id'),("property","title-type")] $ x]) + (titleType title) + toDateNode id' date = [dcNode "date" ! + (("id",id') : + maybe [] (\x -> [("opf:event",x)]) (dateEvent date)) $ + dateText date] + schemeToOnix "ISBN-10" = "02" + schemeToOnix "GTIN-13" = "03" + schemeToOnix "UPC" = "04" + schemeToOnix "ISMN-10" = "05" + schemeToOnix "DOI" = "06" + schemeToOnix "LCCN" = "13" + schemeToOnix "GTIN-14" = "14" + schemeToOnix "ISBN-13" = "15" + schemeToOnix "Legal deposit number" = "17" + schemeToOnix "URN" = "22" + schemeToOnix "OCLC" = "23" + schemeToOnix "ISMN-13" = "25" + schemeToOnix "ISBN-A" = "26" + schemeToOnix "JP" = "27" + schemeToOnix "OLCC" = "28" + schemeToOnix _ = "01" + +showDateTimeISO8601 :: UTCTime -> String +showDateTimeISO8601 = formatTime defaultTimeLocale "%FT%TZ" + +transformTag :: PandocMonad m + => WriterOptions + -- -> IORef [(FilePath, (FilePath, Maybe Entry))] -- ^ (oldpath, newpath, entry) media + -> Tag String + -> E m (Tag String) +transformTag opts tag@(TagOpen name attr) + | name `elem` ["video", "source", "img", "audio"] && + lookup "data-external" attr == Nothing = do + let src = fromAttrib "src" tag + let poster = fromAttrib "poster" tag + newsrc <- modifyMediaRef opts src + newposter <- modifyMediaRef opts poster + let attr' = filter (\(x,_) -> x /= "src" && x /= "poster") attr ++ + [("src", newsrc) | not (null newsrc)] ++ + [("poster", newposter) | not (null newposter)] + return $ TagOpen name attr' +transformTag _ tag = return tag + +modifyMediaRef :: PandocMonad m + => WriterOptions + -> FilePath + -> E m FilePath +modifyMediaRef _ "" = return "" +modifyMediaRef opts oldsrc = do + media <- gets stMediaPaths + case lookup oldsrc media of + Just (n,_) -> return n + Nothing -> catchError + (do (img, mbMime) <- P.fetchItem (writerSourceURL opts) oldsrc + let new = "media/file" ++ show (length media) ++ + fromMaybe (takeExtension (takeWhile (/='?') oldsrc)) + (('.':) <$> (mbMime >>= extensionFromMimeType)) + epochtime <- floor `fmap` lift P.getPOSIXTime + let entry = toEntry new epochtime $ B.fromChunks . (:[]) $ img + modify $ \st -> st{ stMediaPaths = + (oldsrc, (new, Just entry)):media} + return new) + (\e -> do + report $ CouldNotFetchResource oldsrc (show e) + return oldsrc) + +transformBlock :: PandocMonad m + => WriterOptions + -- -> IORef [(FilePath, (FilePath, Maybe Entry))] -- ^ (oldpath, newpath, entry) media + -> Block + -> E m Block +transformBlock opts (RawBlock fmt raw) + | fmt == Format "html" = do + let tags = parseTags raw + tags' <- mapM (transformTag opts) tags + return $ RawBlock fmt (renderTags' tags') +transformBlock _ b = return b + +transformInline :: PandocMonad m + => WriterOptions + -- -> IORef [(FilePath, (FilePath, Maybe Entry))] -- ^ (oldpath, newpath) media + -> Inline + -> E m Inline +transformInline opts (Image attr lab (src,tit)) = do + newsrc <- modifyMediaRef opts src + return $ Image attr lab (newsrc, tit) +transformInline opts (x@(Math t m)) + | WebTeX url <- writerHTMLMathMethod opts = do + newsrc <- modifyMediaRef opts (url ++ urlEncode m) + let mathclass = if t == DisplayMath then "display" else "inline" + return $ Span ("",["math",mathclass],[]) [Image nullAttr [x] (newsrc, "")] +transformInline opts (RawInline fmt raw) + | fmt == Format "html" = do + let tags = parseTags raw + tags' <- mapM (transformTag opts) tags + return $ RawInline fmt (renderTags' tags') +transformInline _ x = return x + +(!) :: (t -> Element) -> [(String, String)] -> t -> Element +(!) f attrs n = add_attrs (map (\(k,v) -> Attr (unqual k) v) attrs) (f n) + +-- | Version of 'ppTopElement' that specifies UTF-8 encoding. +ppTopElement :: Element -> String +ppTopElement = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" ++) . unEntity . ppElement + -- unEntity removes numeric entities introduced by ppElement + -- (kindlegen seems to choke on these). + where unEntity [] = "" + unEntity ('&':'#':xs) = + let (ds,ys) = break (==';') xs + rest = drop 1 ys + in case safeRead ('\'':'\\':ds ++ "'") of + Just x -> x : unEntity rest + Nothing -> '&':'#':unEntity xs + unEntity (x:xs) = x : unEntity xs + +mediaTypeOf :: FilePath -> Maybe MimeType +mediaTypeOf x = + let mediaPrefixes = ["image", "video", "audio"] in + case getMimeType x of + Just y | any (`isPrefixOf` y) mediaPrefixes -> Just y + _ -> Nothing + +-- Returns filename for chapter number. +showChapter :: Int -> String +showChapter = printf "ch%03d.xhtml" + +-- Add identifiers to any headers without them. +addIdentifiers :: [Block] -> [Block] +addIdentifiers bs = evalState (mapM go bs) Set.empty + where go (Header n (ident,classes,kvs) ils) = do + ids <- get + let ident' = if null ident + then uniqueIdent ils ids + else ident + modify $ Set.insert ident' + return $ Header n (ident',classes,kvs) ils + go x = return x + +-- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM +normalizeDate' :: String -> Maybe String +normalizeDate' xs = + let xs' = trim xs in + case xs' of + [y1,y2,y3,y4] | all isDigit [y1,y2,y3,y4] -> Just xs' -- YYYY + [y1,y2,y3,y4,'-',m1,m2] | all isDigit [y1,y2,y3,y4,m1,m2] -- YYYY-MM + -> Just xs' + _ -> normalizeDate xs' + +toRelator :: String -> Maybe String +toRelator x + | x `elem` relators = Just x + | otherwise = lookup (map toLower x) relatorMap + +relators :: [String] +relators = map snd relatorMap + +relatorMap :: [(String, String)] +relatorMap = + [("abridger", "abr") + ,("actor", "act") + ,("adapter", "adp") + ,("addressee", "rcp") + ,("analyst", "anl") + ,("animator", "anm") + ,("annotator", "ann") + ,("appellant", "apl") + ,("appellee", "ape") + ,("applicant", "app") + ,("architect", "arc") + ,("arranger", "arr") + ,("art copyist", "acp") + ,("art director", "adi") + ,("artist", "art") + ,("artistic director", "ard") + ,("assignee", "asg") + ,("associated name", "asn") + ,("attributed name", "att") + ,("auctioneer", "auc") + ,("author", "aut") + ,("author in quotations or text abstracts", "aqt") + ,("author of afterword, colophon, etc.", "aft") + ,("author of dialog", "aud") + ,("author of introduction, etc.", "aui") + ,("autographer", "ato") + ,("bibliographic antecedent", "ant") + ,("binder", "bnd") + ,("binding designer", "bdd") + ,("blurb writer", "blw") + ,("book designer", "bkd") + ,("book producer", "bkp") + ,("bookjacket designer", "bjd") + ,("bookplate designer", "bpd") + ,("bookseller", "bsl") + ,("braille embosser", "brl") + ,("broadcaster", "brd") + ,("calligrapher", "cll") + ,("cartographer", "ctg") + ,("caster", "cas") + ,("censor", "cns") + ,("choreographer", "chr") + ,("cinematographer", "cng") + ,("client", "cli") + ,("collection registrar", "cor") + ,("collector", "col") + ,("collotyper", "clt") + ,("colorist", "clr") + ,("commentator", "cmm") + ,("commentator for written text", "cwt") + ,("compiler", "com") + ,("complainant", "cpl") + ,("complainant-appellant", "cpt") + ,("complainant-appellee", "cpe") + ,("composer", "cmp") + ,("compositor", "cmt") + ,("conceptor", "ccp") + ,("conductor", "cnd") + ,("conservator", "con") + ,("consultant", "csl") + ,("consultant to a project", "csp") + ,("contestant", "cos") + ,("contestant-appellant", "cot") + ,("contestant-appellee", "coe") + ,("contestee", "cts") + ,("contestee-appellant", "ctt") + ,("contestee-appellee", "cte") + ,("contractor", "ctr") + ,("contributor", "ctb") + ,("copyright claimant", "cpc") + ,("copyright holder", "cph") + ,("corrector", "crr") + ,("correspondent", "crp") + ,("costume designer", "cst") + ,("court governed", "cou") + ,("court reporter", "crt") + ,("cover designer", "cov") + ,("creator", "cre") + ,("curator", "cur") + ,("dancer", "dnc") + ,("data contributor", "dtc") + ,("data manager", "dtm") + ,("dedicatee", "dte") + ,("dedicator", "dto") + ,("defendant", "dfd") + ,("defendant-appellant", "dft") + ,("defendant-appellee", "dfe") + ,("degree granting institution", "dgg") + ,("delineator", "dln") + ,("depicted", "dpc") + ,("depositor", "dpt") + ,("designer", "dsr") + ,("director", "drt") + ,("dissertant", "dis") + ,("distribution place", "dbp") + ,("distributor", "dst") + ,("donor", "dnr") + ,("draftsman", "drm") + ,("dubious author", "dub") + ,("editor", "edt") + ,("editor of compilation", "edc") + ,("editor of moving image work", "edm") + ,("electrician", "elg") + ,("electrotyper", "elt") + ,("enacting jurisdiction", "enj") + ,("engineer", "eng") + ,("engraver", "egr") + ,("etcher", "etr") + ,("event place", "evp") + ,("expert", "exp") + ,("facsimilist", "fac") + ,("field director", "fld") + ,("film director", "fmd") + ,("film distributor", "fds") + ,("film editor", "flm") + ,("film producer", "fmp") + ,("filmmaker", "fmk") + ,("first party", "fpy") + ,("forger", "frg") + ,("former owner", "fmo") + ,("funder", "fnd") + ,("geographic information specialist", "gis") + ,("honoree", "hnr") + ,("host", "hst") + ,("host institution", "his") + ,("illuminator", "ilu") + ,("illustrator", "ill") + ,("inscriber", "ins") + ,("instrumentalist", "itr") + ,("interviewee", "ive") + ,("interviewer", "ivr") + ,("inventor", "inv") + ,("issuing body", "isb") + ,("judge", "jud") + ,("jurisdiction governed", "jug") + ,("laboratory", "lbr") + ,("laboratory director", "ldr") + ,("landscape architect", "lsa") + ,("lead", "led") + ,("lender", "len") + ,("libelant", "lil") + ,("libelant-appellant", "lit") + ,("libelant-appellee", "lie") + ,("libelee", "lel") + ,("libelee-appellant", "let") + ,("libelee-appellee", "lee") + ,("librettist", "lbt") + ,("licensee", "lse") + ,("licensor", "lso") + ,("lighting designer", "lgd") + ,("lithographer", "ltg") + ,("lyricist", "lyr") + ,("manufacture place", "mfp") + ,("manufacturer", "mfr") + ,("marbler", "mrb") + ,("markup editor", "mrk") + ,("metadata contact", "mdc") + ,("metal-engraver", "mte") + ,("moderator", "mod") + ,("monitor", "mon") + ,("music copyist", "mcp") + ,("musical director", "msd") + ,("musician", "mus") + ,("narrator", "nrt") + ,("onscreen presenter", "osp") + ,("opponent", "opn") + ,("organizer of meeting", "orm") + ,("originator", "org") + ,("other", "oth") + ,("owner", "own") + ,("panelist", "pan") + ,("papermaker", "ppm") + ,("patent applicant", "pta") + ,("patent holder", "pth") + ,("patron", "pat") + ,("performer", "prf") + ,("permitting agency", "pma") + ,("photographer", "pht") + ,("plaintiff", "ptf") + ,("plaintiff-appellant", "ptt") + ,("plaintiff-appellee", "pte") + ,("platemaker", "plt") + ,("praeses", "pra") + ,("presenter", "pre") + ,("printer", "prt") + ,("printer of plates", "pop") + ,("printmaker", "prm") + ,("process contact", "prc") + ,("producer", "pro") + ,("production company", "prn") + ,("production designer", "prs") + ,("production manager", "pmn") + ,("production personnel", "prd") + ,("production place", "prp") + ,("programmer", "prg") + ,("project director", "pdr") + ,("proofreader", "pfr") + ,("provider", "prv") + ,("publication place", "pup") + ,("publisher", "pbl") + ,("publishing director", "pbd") + ,("puppeteer", "ppt") + ,("radio director", "rdd") + ,("radio producer", "rpc") + ,("recording engineer", "rce") + ,("recordist", "rcd") + ,("redaktor", "red") + ,("renderer", "ren") + ,("reporter", "rpt") + ,("repository", "rps") + ,("research team head", "rth") + ,("research team member", "rtm") + ,("researcher", "res") + ,("respondent", "rsp") + ,("respondent-appellant", "rst") + ,("respondent-appellee", "rse") + ,("responsible party", "rpy") + ,("restager", "rsg") + ,("restorationist", "rsr") + ,("reviewer", "rev") + ,("rubricator", "rbr") + ,("scenarist", "sce") + ,("scientific advisor", "sad") + ,("screenwriter", "aus") + ,("scribe", "scr") + ,("sculptor", "scl") + ,("second party", "spy") + ,("secretary", "sec") + ,("seller", "sll") + ,("set designer", "std") + ,("setting", "stg") + ,("signer", "sgn") + ,("singer", "sng") + ,("sound designer", "sds") + ,("speaker", "spk") + ,("sponsor", "spn") + ,("stage director", "sgd") + ,("stage manager", "stm") + ,("standards body", "stn") + ,("stereotyper", "str") + ,("storyteller", "stl") + ,("supporting host", "sht") + ,("surveyor", "srv") + ,("teacher", "tch") + ,("technical director", "tcd") + ,("television director", "tld") + ,("television producer", "tlp") + ,("thesis advisor", "ths") + ,("transcriber", "trc") + ,("translator", "trl") + ,("type designer", "tyd") + ,("typographer", "tyg") + ,("university place", "uvp") + ,("videographer", "vdg") + ,("witness", "wit") + ,("wood engraver", "wde") + ,("woodcutter", "wdc") + ,("writer of accompanying material", "wam") + ,("writer of added commentary", "wac") + ,("writer of added lyrics", "wal") + ,("writer of added text", "wat") + ] + +docTitle' :: Meta -> [Inline] +docTitle' meta = fromMaybe [] $ go <$> lookupMeta "title" meta + where go (MetaString s) = [Str s] + go (MetaInlines xs) = xs + go (MetaBlocks [Para xs]) = xs + go (MetaBlocks [Plain xs]) = xs + go (MetaMap m) = + case M.lookup "type" m of + Just x | stringify x == "main" -> + maybe [] go $ M.lookup "text" m + _ -> [] + go (MetaList xs) = concatMap go xs + go _ = [] diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs new file mode 100644 index 000000000..967fe6a4c --- /dev/null +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -0,0 +1,617 @@ +{-# LANGUAGE PatternGuards #-} + +{- +Copyright (c) 2011-2012, Sergey Astanin +All rights reserved. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | Conversion of 'Pandoc' documents to FB2 (FictionBook2) format. + +FictionBook is an XML-based e-book format. For more information see: +<http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1> + +-} +module Text.Pandoc.Writers.FB2 (writeFB2) where + +import Control.Monad.State (StateT, evalStateT, get, modify, lift) +import Control.Monad.State (liftM) +import Data.ByteString.Base64 (encode) +import Data.Char (toLower, isSpace, isAscii, isControl) +import Data.List (intersperse, intercalate, isPrefixOf, stripPrefix) +import Data.Either (lefts, rights) +import Network.HTTP (urlEncode) +import Network.URI (isURI) +import Text.XML.Light +import qualified Text.XML.Light as X +import qualified Text.XML.Light.Cursor as XC +import qualified Data.ByteString.Char8 as B8 +import Control.Monad.Except (throwError, catchError) + +import Text.Pandoc.Logging +import Text.Pandoc.Definition +import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def) +import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock, capitalize, + linesToPara) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import qualified Text.Pandoc.Class as P + +-- | Data to be written at the end of the document: +-- (foot)notes, URLs, references, images. +data FbRenderState = FbRenderState + { footnotes :: [ (Int, String, [Content]) ] -- ^ #, ID, text + , imagesToFetch :: [ (String, String) ] -- ^ filename, URL or path + , parentListMarker :: String -- ^ list marker of the parent ordered list + , parentBulletLevel :: Int -- ^ nesting level of the unordered list + , writerOptions :: WriterOptions + } deriving (Show) + +-- | FictionBook building monad. +type FBM m = StateT FbRenderState m + +newFB :: FbRenderState +newFB = FbRenderState { footnotes = [], imagesToFetch = [] + , parentListMarker = "", parentBulletLevel = 0 + , writerOptions = def } + +data ImageMode = NormalImage | InlineImage deriving (Eq) +instance Show ImageMode where + show NormalImage = "imageType" + show InlineImage = "inlineImageType" + +-- | Produce an FB2 document from a 'Pandoc' document. +writeFB2 :: PandocMonad m + => WriterOptions -- ^ conversion options + -> Pandoc -- ^ document to convert + -> m String -- ^ FictionBook2 document (not encoded yet) +writeFB2 opts doc = flip evalStateT newFB $ pandocToFB2 opts doc + +pandocToFB2 :: PandocMonad m + => WriterOptions + -> Pandoc + -> FBM m String +pandocToFB2 opts (Pandoc meta blocks) = do + modify (\s -> s { writerOptions = opts }) + desc <- description meta + fp <- frontpage meta + secs <- renderSections 1 blocks + let body = el "body" $ fp ++ secs + notes <- renderFootnotes + (imgs,missing) <- liftM imagesToFetch get >>= \s -> lift (fetchImages s) + let body' = replaceImagesWithAlt missing body + let fb2_xml = el "FictionBook" (fb2_attrs, [desc, body'] ++ notes ++ imgs) + return $ xml_head ++ (showContent fb2_xml) ++ "\n" + where + xml_head = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + fb2_attrs = + let xmlns = "http://www.gribuser.ru/xml/fictionbook/2.0" + xlink = "http://www.w3.org/1999/xlink" + in [ uattr "xmlns" xmlns + , attr ("xmlns", "l") xlink ] + +frontpage :: PandocMonad m => Meta -> FBM m [Content] +frontpage meta' = do + t <- cMapM toXml . docTitle $ meta' + return $ + [ el "title" (el "p" t) + , el "annotation" (map (el "p" . cMap plain) + (docAuthors meta' ++ [docDate meta'])) + ] + +description :: PandocMonad m => Meta -> FBM m Content +description meta' = do + bt <- booktitle meta' + let as = authors meta' + dd <- docdate meta' + return $ el "description" + [ el "title-info" (bt ++ as ++ dd) + , el "document-info" [ el "program-used" "pandoc" ] -- FIXME: +version + ] + +booktitle :: PandocMonad m => Meta -> FBM m [Content] +booktitle meta' = do + t <- cMapM toXml . docTitle $ meta' + return $ if null t + then [] + else [ el "book-title" t ] + +authors :: Meta -> [Content] +authors meta' = cMap author (docAuthors meta') + +author :: [Inline] -> [Content] +author ss = + let ws = words . cMap plain $ ss + email = (el "email") `fmap` (take 1 $ filter ('@' `elem`) ws) + ws' = filter ('@' `notElem`) ws + names = case ws' of + (nickname:[]) -> [ el "nickname" nickname ] + (fname:lname:[]) -> [ el "first-name" fname + , el "last-name" lname ] + (fname:rest) -> [ el "first-name" fname + , el "middle-name" (concat . init $ rest) + , el "last-name" (last rest) ] + ([]) -> [] + in list $ el "author" (names ++ email) + +docdate :: PandocMonad m => Meta -> FBM m [Content] +docdate meta' = do + let ss = docDate meta' + d <- cMapM toXml ss + return $ if null d + then [] + else [el "date" d] + +-- | Divide the stream of blocks into sections and convert to XML +-- representation. +renderSections :: PandocMonad m => Int -> [Block] -> FBM m [Content] +renderSections level blocks = do + let secs = splitSections level blocks + mapM (renderSection level) secs + +renderSection :: PandocMonad m => Int -> ([Inline], [Block]) -> FBM m Content +renderSection level (ttl, body) = do + title <- if null ttl + then return [] + else return . list . el "title" . formatTitle $ ttl + content <- if (hasSubsections body) + then renderSections (level + 1) body + else cMapM blockToXml body + return $ el "section" (title ++ content) + where + hasSubsections = any isHeaderBlock + +-- | Only <p> and <empty-line> are allowed within <title> in FB2. +formatTitle :: [Inline] -> [Content] +formatTitle inlines = + let lns = split isLineBreak inlines + lns' = map (el "p" . cMap plain) lns + in intersperse (el "empty-line" ()) lns' + +split :: (a -> Bool) -> [a] -> [[a]] +split _ [] = [] +split cond xs = let (b,a) = break cond xs + in (b:split cond (drop 1 a)) + +isLineBreak :: Inline -> Bool +isLineBreak LineBreak = True +isLineBreak _ = False + +-- | Divide the stream of block elements into sections: [(title, blocks)]. +splitSections :: Int -> [Block] -> [([Inline], [Block])] +splitSections level blocks = reverse $ revSplit (reverse blocks) + where + revSplit [] = [] + revSplit rblocks = + let (lastsec, before) = break sameLevel rblocks + (header, prevblocks) = + case before of + ((Header n _ title):prevblocks') -> + if n == level + then (title, prevblocks') + else ([], before) + _ -> ([], before) + in (header, reverse lastsec) : revSplit prevblocks + sameLevel (Header n _ _) = n == level + sameLevel _ = False + +-- | Make another FictionBook body with footnotes. +renderFootnotes :: PandocMonad m => FBM m [Content] +renderFootnotes = do + fns <- footnotes `liftM` get + if null fns + then return [] -- no footnotes + else return . list $ + el "body" ([uattr "name" "notes"], map renderFN (reverse fns)) + where + renderFN (n, idstr, cs) = + let fn_texts = (el "title" (el "p" (show n))) : cs + in el "section" ([uattr "id" idstr], fn_texts) + +-- | Fetch images and encode them for the FictionBook XML. +-- Return image data and a list of hrefs of the missing images. +fetchImages :: PandocMonad m => [(String,String)] -> m ([Content],[String]) +fetchImages links = do + imgs <- mapM (uncurry fetchImage) links + return $ (rights imgs, lefts imgs) + +-- | Fetch image data from disk or from network and make a <binary> XML section. +-- Return either (Left hrefOfMissingImage) or (Right xmlContent). +fetchImage :: PandocMonad m => String -> String -> m (Either String Content) +fetchImage href link = do + mbimg <- + case (isURI link, readDataURI link) of + (True, Just (mime,_,True,base64)) -> + let mime' = map toLower mime + in if mime' == "image/png" || mime' == "image/jpeg" + then return (Just (mime',base64)) + else return Nothing + (True, Just _) -> return Nothing -- not base64-encoded + _ -> do + catchError (do (bs, mbmime) <- P.fetchItem Nothing link + case mbmime of + Nothing -> do + report $ CouldNotDetermineMimeType link + return Nothing + Just mime -> return $ Just (mime, + B8.unpack $ encode bs)) + (\e -> + do report $ CouldNotFetchResource link (show e) + return Nothing) + case mbimg of + Just (imgtype, imgdata) -> do + return . Right $ el "binary" + ( [uattr "id" href + , uattr "content-type" imgtype] + , txt imgdata ) + _ -> return (Left ('#':href)) + + +-- | Extract mime type and encoded data from the Data URI. +readDataURI :: String -- ^ URI + -> Maybe (String,String,Bool,String) + -- ^ Maybe (mime,charset,isBase64,data) +readDataURI uri = + case stripPrefix "data:" uri of + Nothing -> Nothing + Just rest -> + let meta = takeWhile (/= ',') rest -- without trailing ',' + uridata = drop (length meta + 1) rest + parts = split (== ';') meta + (mime,cs,enc)=foldr upd ("text/plain","US-ASCII",False) parts + in Just (mime,cs,enc,uridata) + + where + upd str m@(mime,cs,enc) + | isMimeType str = (str,cs,enc) + | Just str' <- stripPrefix "charset=" str = (mime,str',enc) + | str == "base64" = (mime,cs,True) + | otherwise = m + +-- Without parameters like ;charset=...; see RFC 2045, 5.1 +isMimeType :: String -> Bool +isMimeType s = + case split (=='/') s of + [mtype,msubtype] -> + ((map toLower mtype) `elem` types + || "x-" `isPrefixOf` (map toLower mtype)) + && all valid mtype + && all valid msubtype + _ -> False + where + types = ["text","image","audio","video","application","message","multipart"] + valid c = isAscii c && not (isControl c) && not (isSpace c) && + c `notElem` "()<>@,;:\\\"/[]?=" + +footnoteID :: Int -> String +footnoteID i = "n" ++ (show i) + +linkID :: Int -> String +linkID i = "l" ++ (show i) + +-- | Convert a block-level Pandoc's element to FictionBook XML representation. +blockToXml :: PandocMonad m => Block -> FBM m [Content] +blockToXml (Plain ss) = cMapM toXml ss -- FIXME: can lead to malformed FB2 +blockToXml (Para [Math DisplayMath formula]) = insertMath NormalImage formula +-- title beginning with fig: indicates that the image is a figure +blockToXml (Para [Image atr alt (src,'f':'i':'g':':':tit)]) = + insertImage NormalImage (Image atr alt (src,tit)) +blockToXml (Para ss) = liftM (list . el "p") $ cMapM toXml ss +blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . + map (el "p" . el "code") . lines $ s +blockToXml b@(RawBlock _ _) = do + report $ BlockNotRendered b + return [] +blockToXml (Div _ bs) = cMapM blockToXml bs +blockToXml (BlockQuote bs) = liftM (list . el "cite") $ cMapM blockToXml bs +blockToXml (LineBlock lns) = blockToXml $ linesToPara lns +blockToXml (OrderedList a bss) = do + state <- get + let pmrk = parentListMarker state + let markers = map ((pmrk ++ " ") ++) $ orderedListMarkers a + let mkitem mrk bs = do + modify (\s -> s { parentListMarker = mrk }) + itemtext <- cMapM blockToXml . paraToPlain $ bs + modify (\s -> s { parentListMarker = pmrk }) -- old parent marker + return . el "p" $ [ txt mrk, txt " " ] ++ itemtext + mapM (uncurry mkitem) (zip markers bss) +blockToXml (BulletList bss) = do + state <- get + let level = parentBulletLevel state + let pmrk = parentListMarker state + let prefix = replicate (length pmrk) ' ' + let bullets = ["\x2022", "\x25e6", "*", "\x2043", "\x2023"] + let mrk = prefix ++ bullets !! (level `mod` (length bullets)) + let mkitem bs = do + modify (\s -> s { parentBulletLevel = (level+1) }) + itemtext <- cMapM blockToXml . paraToPlain $ bs + modify (\s -> s { parentBulletLevel = level }) -- restore bullet level + return $ el "p" $ [ txt (mrk ++ " ") ] ++ itemtext + mapM mkitem bss +blockToXml (DefinitionList defs) = + cMapM mkdef defs + where + mkdef (term, bss) = do + def' <- cMapM (cMapM blockToXml . sep . paraToPlain . map indent) bss + t <- wrap "strong" term + return [ el "p" t, el "p" def' ] + sep blocks = + if all needsBreak blocks then + blocks ++ [Plain [LineBreak]] + else + blocks + needsBreak (Para _) = False + needsBreak (Plain ins) = LineBreak `notElem` ins + needsBreak _ = True +blockToXml (Header _ _ _) = -- should never happen, see renderSections + throwError $ PandocShouldNeverHappenError "unexpected header in section text" +blockToXml HorizontalRule = return + [ el "empty-line" () + , el "p" (txt (replicate 10 '—')) + , el "empty-line" () ] +blockToXml (Table caption aligns _ headers rows) = do + hd <- mkrow "th" headers aligns + bd <- mapM (\r -> mkrow "td" r aligns) rows + c <- return . el "emphasis" =<< cMapM toXml caption + return [el "table" (hd : bd), el "p" c] + where + mkrow :: PandocMonad m => String -> [TableCell] -> [Alignment] -> FBM m Content + mkrow tag cells aligns' = + (el "tr") `liftM` (mapM (mkcell tag) (zip cells aligns')) + -- + mkcell :: PandocMonad m => String -> (TableCell, Alignment) -> FBM m Content + mkcell tag (cell, align) = do + cblocks <- cMapM blockToXml cell + return $ el tag ([align_attr align], cblocks) + -- + align_attr a = Attr (QName "align" Nothing Nothing) (align_str a) + align_str AlignLeft = "left" + align_str AlignCenter = "center" + align_str AlignRight = "right" + align_str AlignDefault = "left" +blockToXml Null = return [] + +-- Replace paragraphs with plain text and line break. +-- Necessary to simulate multi-paragraph lists in FB2. +paraToPlain :: [Block] -> [Block] +paraToPlain [] = [] +paraToPlain (Para inlines : rest) = + let p = (Plain (inlines ++ [LineBreak])) + in p : paraToPlain rest +paraToPlain (p:rest) = p : paraToPlain rest + +-- Simulate increased indentation level. Will not really work +-- for multi-line paragraphs. +indent :: Block -> Block +indent = indentBlock + where + -- indentation space + spacer :: String + spacer = replicate 4 ' ' + -- + indentBlock (Plain ins) = Plain ((Str spacer):ins) + indentBlock (Para ins) = Para ((Str spacer):ins) + indentBlock (CodeBlock a s) = + let s' = unlines . map (spacer++) . lines $ s + in CodeBlock a s' + indentBlock (BlockQuote bs) = BlockQuote (map indent bs) + indentBlock (Header l attr' ins) = Header l attr' (indentLines ins) + indentBlock everythingElse = everythingElse + -- indent every (explicit) line + indentLines :: [Inline] -> [Inline] + indentLines ins = let lns = split isLineBreak ins :: [[Inline]] + in intercalate [LineBreak] $ map ((Str spacer):) lns + +-- | Convert a Pandoc's Inline element to FictionBook XML representation. +toXml :: PandocMonad m => Inline -> FBM m [Content] +toXml (Str s) = return [txt s] +toXml (Span _ ils) = cMapM toXml ils +toXml (Emph ss) = list `liftM` wrap "emphasis" ss +toXml (Strong ss) = list `liftM` wrap "strong" ss +toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss +toXml (Superscript ss) = list `liftM` wrap "sup" ss +toXml (Subscript ss) = list `liftM` wrap "sub" ss +toXml (SmallCaps ss) = cMapM toXml $ capitalize ss +toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific + inner <- cMapM toXml ss + return $ [txt "‘"] ++ inner ++ [txt "’"] +toXml (Quoted DoubleQuote ss) = do + inner <- cMapM toXml ss + return $ [txt "“"] ++ inner ++ [txt "”"] +toXml (Cite _ ss) = cMapM toXml ss -- FIXME: support citation styles +toXml (Code _ s) = return [el "code" s] +toXml Space = return [txt " "] +toXml SoftBreak = return [txt " "] +toXml LineBreak = return [el "empty-line" ()] +toXml (Math _ formula) = insertMath InlineImage formula +toXml il@(RawInline _ _) = do + report $ InlineNotRendered il + return [] -- raw TeX and raw HTML are suppressed +toXml (Link _ text (url,ttl)) = do + fns <- footnotes `liftM` get + let n = 1 + length fns + let ln_id = linkID n + let ln_ref = list . el "sup" . txt $ "[" ++ show n ++ "]" + ln_text <- cMapM toXml text + let ln_desc = + let ttl' = dropWhile isSpace ttl + in if null ttl' + then list . el "p" $ el "code" url + else list . el "p" $ [ txt (ttl' ++ ": "), el "code" url ] + modify (\s -> s { footnotes = (n, ln_id, ln_desc) : fns }) + return $ ln_text ++ + [ el "a" + ( [ attr ("l","href") ('#':ln_id) + , uattr "type" "note" ] + , ln_ref) ] +toXml img@(Image _ _ _) = insertImage InlineImage img +toXml (Note bs) = do + fns <- footnotes `liftM` get + let n = 1 + length fns + let fn_id = footnoteID n + fn_desc <- cMapM blockToXml bs + modify (\s -> s { footnotes = (n, fn_id, fn_desc) : fns }) + let fn_ref = el "sup" . txt $ "[" ++ show n ++ "]" + return . list $ el "a" ( [ attr ("l","href") ('#':fn_id) + , uattr "type" "note" ] + , fn_ref ) + +insertMath :: PandocMonad m => ImageMode -> String -> FBM m [Content] +insertMath immode formula = do + htmlMath <- return . writerHTMLMathMethod . writerOptions =<< get + case htmlMath of + WebTeX url -> do + let alt = [Code nullAttr formula] + let imgurl = url ++ urlEncode formula + let img = Image nullAttr alt (imgurl, "") + insertImage immode img + _ -> return [el "code" formula] + +insertImage :: PandocMonad m => ImageMode -> Inline -> FBM m [Content] +insertImage immode (Image _ alt (url,ttl)) = do + images <- imagesToFetch `liftM` get + let n = 1 + length images + let fname = "image" ++ show n + modify (\s -> s { imagesToFetch = (fname, url) : images }) + let ttlattr = case (immode, null ttl) of + (NormalImage, False) -> [ uattr "title" ttl ] + _ -> [] + return . list $ + el "image" $ + [ attr ("l","href") ('#':fname) + , attr ("l","type") (show immode) + , uattr "alt" (cMap plain alt) ] + ++ ttlattr +insertImage _ _ = error "unexpected inline instead of image" + +replaceImagesWithAlt :: [String] -> Content -> Content +replaceImagesWithAlt missingHrefs body = + let cur = XC.fromContent body + cur' = replaceAll cur + in XC.toTree . XC.root $ cur' + where + -- + replaceAll :: XC.Cursor -> XC.Cursor + replaceAll c = + let n = XC.current c + c' = if isImage n && isMissing n + then XC.modifyContent replaceNode c + else c + in case XC.nextDF c' of + (Just cnext) -> replaceAll cnext + Nothing -> c' -- end of document + -- + isImage :: Content -> Bool + isImage (Elem e) = (elName e) == (uname "image") + isImage _ = False + -- + isMissing (Elem img@(Element _ _ _ _)) = + let imgAttrs = elAttribs img + badAttrs = map (attr ("l","href")) missingHrefs + in any (`elem` imgAttrs) badAttrs + isMissing _ = False + -- + replaceNode :: Content -> Content + replaceNode n@(Elem img@(Element _ _ _ _)) = + let attrs = elAttribs img + alt = getAttrVal attrs (uname "alt") + imtype = getAttrVal attrs (qname "l" "type") + in case (alt, imtype) of + (Just alt', Just imtype') -> + if imtype' == show NormalImage + then el "p" alt' + else txt alt' + (Just alt', Nothing) -> txt alt' -- no type attribute + _ -> n -- don't replace if alt text is not found + replaceNode n = n + -- + getAttrVal :: [X.Attr] -> QName -> Maybe String + getAttrVal attrs name = + case filter ((name ==) . attrKey) attrs of + (a:_) -> Just (attrVal a) + _ -> Nothing + + +-- | Wrap all inlines with an XML tag (given its unqualified name). +wrap :: PandocMonad m => String -> [Inline] -> FBM m Content +wrap tagname inlines = el tagname `liftM` cMapM toXml inlines + +-- " Create a singleton list. +list :: a -> [a] +list = (:[]) + +-- | Convert an 'Inline' to plaintext. +plain :: Inline -> String +plain (Str s) = s +plain (Emph ss) = concat (map plain ss) +plain (Span _ ss) = concat (map plain ss) +plain (Strong ss) = concat (map plain ss) +plain (Strikeout ss) = concat (map plain ss) +plain (Superscript ss) = concat (map plain ss) +plain (Subscript ss) = concat (map plain ss) +plain (SmallCaps ss) = concat (map plain ss) +plain (Quoted _ ss) = concat (map plain ss) +plain (Cite _ ss) = concat (map plain ss) -- FIXME +plain (Code _ s) = s +plain Space = " " +plain SoftBreak = " " +plain LineBreak = "\n" +plain (Math _ s) = s +plain (RawInline _ _) = "" +plain (Link _ text (url,_)) = concat (map plain text ++ [" <", url, ">"]) +plain (Image _ alt _) = concat (map plain alt) +plain (Note _) = "" -- FIXME + +-- | Create an XML element. +el :: (Node t) + => String -- ^ unqualified element name + -> t -- ^ node contents + -> Content -- ^ XML content +el name cs = Elem $ unode name cs + +-- | Put empty lines around content +spaceBeforeAfter :: [Content] -> [Content] +spaceBeforeAfter cs = + let emptyline = el "empty-line" () + in [emptyline] ++ cs ++ [emptyline] + +-- | Create a plain-text XML content. +txt :: String -> Content +txt s = Text $ CData CDataText s Nothing + +-- | Create an XML attribute with an unqualified name. +uattr :: String -> String -> Text.XML.Light.Attr +uattr name val = Attr (uname name) val + +-- | Create an XML attribute with a qualified name from given namespace. +attr :: (String, String) -> String -> Text.XML.Light.Attr +attr (ns, name) val = Attr (qname ns name) val + +-- | Unqualified name +uname :: String -> QName +uname name = QName name Nothing Nothing + +-- | Qualified name +qname :: String -> String -> QName +qname ns name = QName name Nothing (Just ns) + +-- | Abbreviation for 'concatMap'. +cMap :: (a -> [b]) -> [a] -> [b] +cMap = concatMap + +-- | Monadic equivalent of 'concatMap'. +cMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +cMapM f xs = concat `liftM` mapM f xs diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs new file mode 100644 index 000000000..99f8c5b42 --- /dev/null +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -0,0 +1,1069 @@ +{-# LANGUAGE OverloadedStrings, CPP, ViewPatterns, ScopedTypeVariables #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.HTML + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to HTML. +-} +module Text.Pandoc.Writers.HTML ( + writeHtml4, + writeHtml4String, + writeHtml5, + writeHtml5String, + writeHtmlStringForEPUB, + writeS5, + writeSlidy, + writeSlideous, + writeDZSlides, + writeRevealJs + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Data.Monoid ((<>)) +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates +import Text.Pandoc.Writers.Math +import Text.Pandoc.Slides +import Text.Pandoc.Highlighting ( highlight, styleToCss, + formatHtmlInline, formatHtmlBlock ) +import Text.Pandoc.XML (fromEntities, escapeStringForXML) +import Network.URI ( parseURIReference, URI(..), unEscapeString ) +import Network.HTTP ( urlEncode ) +import Numeric ( showHex ) +import Data.Char ( ord, toLower ) +import Data.List ( isPrefixOf, intersperse ) +import Data.String ( fromString ) +import Data.Maybe ( catMaybes, fromMaybe, isJust ) +import Control.Monad.State +import Text.Blaze.Html hiding(contents) +#if MIN_VERSION_blaze_markup(0,6,3) +#else +import Text.Blaze.Internal(preEscapedString) +#endif +#if MIN_VERSION_blaze_html(0,5,1) +import qualified Text.Blaze.XHtml5 as H5 +#else +import qualified Text.Blaze.Html5 as H5 +#endif +import qualified Text.Blaze.XHtml1.Transitional as H +import qualified Text.Blaze.XHtml1.Transitional.Attributes as A +import Text.Blaze.Html.Renderer.String (renderHtml) +import Text.TeXMath +import Text.XML.Light.Output +import Text.XML.Light (unode, elChildren, unqual) +import qualified Text.XML.Light as XML +import System.FilePath (takeExtension) +import Data.Aeson (Value) +import Control.Monad.Except (throwError) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +data WriterState = WriterState + { stNotes :: [Html] -- ^ List of notes + , stMath :: Bool -- ^ Math is used in document + , stQuotes :: Bool -- ^ <q> tag is used + , stHighlighting :: Bool -- ^ Syntax highlighting is used + , stSecNum :: [Int] -- ^ Number of current section + , stElement :: Bool -- ^ Processing an Element + , stHtml5 :: Bool -- ^ Use HTML5 + , stEPUBVersion :: Maybe EPUBVersion -- ^ EPUB version if for epub + , stSlideVariant :: HTMLSlideVariant + } + +defaultWriterState :: WriterState +defaultWriterState = WriterState {stNotes= [], stMath = False, stQuotes = False, + stHighlighting = False, stSecNum = [], + stElement = False, stHtml5 = False, + stEPUBVersion = Nothing, + stSlideVariant = NoSlides} + +-- Helpers to render HTML with the appropriate function. + +strToHtml :: String -> Html +strToHtml ('\'':xs) = preEscapedString "\'" `mappend` strToHtml xs +strToHtml xs@(_:_) = case break (=='\'') xs of + (_ ,[]) -> toHtml xs + (ys,zs) -> toHtml ys `mappend` strToHtml zs +strToHtml [] = "" + +-- | Hard linebreak. +nl :: WriterOptions -> Html +nl opts = if writerWrapText opts == WrapNone + then mempty + else preEscapedString "\n" + +-- | Convert Pandoc document to Html 5 string. +writeHtml5String :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeHtml5String = writeHtmlString' + defaultWriterState{ stHtml5 = True } + +-- | Convert Pandoc document to Html 5 structure. +writeHtml5 :: PandocMonad m => WriterOptions -> Pandoc -> m Html +writeHtml5 = writeHtml' defaultWriterState{ stHtml5 = True } + +-- | Convert Pandoc document to Html 4 string. +writeHtml4String :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeHtml4String = writeHtmlString' + defaultWriterState{ stHtml5 = False } + +-- | Convert Pandoc document to Html 4 structure. +writeHtml4 :: PandocMonad m => WriterOptions -> Pandoc -> m Html +writeHtml4 = writeHtml' defaultWriterState{ stHtml5 = False } + +-- | Convert Pandoc document to Html appropriate for an epub version. +writeHtmlStringForEPUB :: PandocMonad m + => EPUBVersion -> WriterOptions -> Pandoc -> m String +writeHtmlStringForEPUB version = writeHtmlString' + defaultWriterState{ stHtml5 = version == EPUB3, + stEPUBVersion = Just version } + +-- | Convert Pandoc document to Reveal JS HTML slide show. +writeRevealJs :: PandocMonad m + => WriterOptions -> Pandoc -> m String +writeRevealJs = writeHtmlSlideShow' RevealJsSlides + +-- | Convert Pandoc document to S5 HTML slide show. +writeS5 :: PandocMonad m + => WriterOptions -> Pandoc -> m String +writeS5 = writeHtmlSlideShow' S5Slides + +-- | Convert Pandoc document to Slidy HTML slide show. +writeSlidy :: PandocMonad m + => WriterOptions -> Pandoc -> m String +writeSlidy = writeHtmlSlideShow' SlidySlides + +-- | Convert Pandoc document to Slideous HTML slide show. +writeSlideous :: PandocMonad m + => WriterOptions -> Pandoc -> m String +writeSlideous = writeHtmlSlideShow' SlideousSlides + +-- | Convert Pandoc document to DZSlides HTML slide show. +writeDZSlides :: PandocMonad m + => WriterOptions -> Pandoc -> m String +writeDZSlides = writeHtmlSlideShow' DZSlides + +writeHtmlSlideShow' :: PandocMonad m + => HTMLSlideVariant -> WriterOptions -> Pandoc -> m String +writeHtmlSlideShow' variant = writeHtmlString' + defaultWriterState{ stSlideVariant = variant + , stHtml5 = case variant of + RevealJsSlides -> True + S5Slides -> False + SlidySlides -> False + DZSlides -> True + SlideousSlides -> False + NoSlides -> False + } + +writeHtmlString' :: PandocMonad m + => WriterState -> WriterOptions -> Pandoc -> m String +writeHtmlString' st opts d = do + (body, context) <- evalStateT (pandocToHtml opts d) st + return $ case writerTemplate opts of + Nothing -> renderHtml body + Just tpl -> renderTemplate' tpl $ + defField "body" (renderHtml body) context + +writeHtml' :: PandocMonad m => WriterState -> WriterOptions -> Pandoc -> m Html +writeHtml' st opts d = do + (body, context) <- evalStateT (pandocToHtml opts d) st + return $ case writerTemplate opts of + Nothing -> body + Just tpl -> renderTemplate' tpl $ + defField "body" (renderHtml body) context + +-- result is (title, authors, date, toc, body, new variables) +pandocToHtml :: PandocMonad m + => WriterOptions + -> Pandoc + -> StateT WriterState m (Html, Value) +pandocToHtml opts (Pandoc meta blocks) = do + metadata <- metaToJSON opts + (fmap renderHtml . blockListToHtml opts) + (fmap renderHtml . inlineListToHtml opts) + meta + let stringifyHTML = escapeStringForXML . stringify + let authsMeta = map stringifyHTML $ docAuthors meta + let dateMeta = stringifyHTML $ docDate meta + let slideLevel = fromMaybe (getSlideLevel blocks) $ writerSlideLevel opts + slideVariant <- gets stSlideVariant + let sects = hierarchicalize $ + if slideVariant == NoSlides + then blocks + else prepSlides slideLevel blocks + toc <- if writerTableOfContents opts && slideVariant /= S5Slides + then tableOfContents opts sects + else return Nothing + blocks' <- liftM (mconcat . intersperse (nl opts)) $ + mapM (elementToHtml slideLevel opts) sects + st <- get + notes <- footnoteSection opts (reverse (stNotes st)) + let thebody = blocks' >> notes + let math = case writerHTMLMathMethod opts of + LaTeXMathML (Just url) -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ mempty + MathJax url -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ case slideVariant of + SlideousSlides -> + preEscapedString + "MathJax.Hub.Queue([\"Typeset\",MathJax.Hub]);" + _ -> mempty + JsMath (Just url) -> + H.script ! A.src (toValue url) + ! A.type_ "text/javascript" + $ mempty + KaTeX js css -> + (H.script ! A.src (toValue js) $ mempty) <> + (H.link ! A.rel "stylesheet" ! A.href (toValue css)) <> + (H.script ! A.type_ "text/javascript" $ toHtml renderKaTeX) + _ -> case lookup "mathml-script" (writerVariables opts) of + Just s | not (stHtml5 st) -> + H.script ! A.type_ "text/javascript" + $ preEscapedString + ("/*<![CDATA[*/\n" ++ s ++ "/*]]>*/\n") + | otherwise -> mempty + Nothing -> mempty + let context = (if stHighlighting st + then case writerHighlightStyle opts of + Just sty -> defField "highlighting-css" + (styleToCss sty) + Nothing -> id + else id) $ + (if stMath st + then defField "math" (renderHtml math) + else id) $ + defField "quotes" (stQuotes st) $ + maybe id (defField "toc" . renderHtml) toc $ + defField "author-meta" authsMeta $ + maybe id (defField "date-meta") (normalizeDate dateMeta) $ + defField "pagetitle" (stringifyHTML $ docTitle meta) $ + defField "idprefix" (writerIdentifierPrefix opts) $ + -- these should maybe be set in pandoc.hs + defField "slidy-url" + ("http://www.w3.org/Talks/Tools/Slidy2" :: String) $ + defField "slideous-url" ("slideous" :: String) $ + defField "revealjs-url" ("reveal.js" :: String) $ + defField "s5-url" ("s5/default" :: String) $ + defField "html5" (stHtml5 st) $ + metadata + return (thebody, context) + +-- | Like Text.XHtml's identifier, but adds the writerIdentifierPrefix +prefixedId :: WriterOptions -> String -> Attribute +prefixedId opts s = + case s of + "" -> mempty + _ -> A.id $ toValue $ writerIdentifierPrefix opts ++ s + +toList :: PandocMonad m + => (Html -> Html) + -> WriterOptions + -> [Html] + -> StateT WriterState m Html +toList listop opts items = do + slideVariant <- gets stSlideVariant + return $ + if (writerIncremental opts) + then if (slideVariant /= RevealJsSlides) + then (listop $ mconcat items) ! A.class_ "incremental" + else listop $ mconcat $ map (! A.class_ "fragment") items + else listop $ mconcat items + +unordList :: PandocMonad m + => WriterOptions -> [Html] -> StateT WriterState m Html +unordList opts = toList H.ul opts . toListItems opts + +ordList :: PandocMonad m + => WriterOptions -> [Html] -> StateT WriterState m Html +ordList opts = toList H.ol opts . toListItems opts + +defList :: PandocMonad m + => WriterOptions -> [Html] -> StateT WriterState m Html +defList opts items = toList H.dl opts (items ++ [nl opts]) + +-- | Construct table of contents from list of elements. +tableOfContents :: PandocMonad m => WriterOptions -> [Element] -> StateT WriterState m (Maybe Html) +tableOfContents _ [] = return Nothing +tableOfContents opts sects = do + contents <- mapM (elementToListItem opts) sects + let tocList = catMaybes contents + if null tocList + then return Nothing + else Just <$> unordList opts tocList + +-- | Convert section number to string +showSecNum :: [Int] -> String +showSecNum = concat . intersperse "." . map show + +-- | Converts an Element to a list item for a table of contents, +-- retrieving the appropriate identifier from state. +elementToListItem :: PandocMonad m => WriterOptions -> Element -> StateT WriterState m (Maybe Html) +-- Don't include the empty headers created in slide shows +-- shows when an hrule is used to separate slides without a new title: +elementToListItem _ (Sec _ _ _ [Str "\0"] _) = return Nothing +elementToListItem opts (Sec lev num (id',classes,_) headerText subsecs) + | lev <= writerTOCDepth opts = do + let num' = zipWith (+) num (writerNumberOffset opts ++ repeat 0) + let sectnum = if writerNumberSections opts && not (null num) && + "unnumbered" `notElem` classes + then (H.span ! A.class_ "toc-section-number" + $ toHtml $ showSecNum num') >> preEscapedString " " + else mempty + txt <- liftM (sectnum >>) $ inlineListToHtml opts $ walk deNote headerText + subHeads <- mapM (elementToListItem opts) subsecs >>= return . catMaybes + subList <- if null subHeads + then return mempty + else unordList opts subHeads + -- in reveal.js, we need #/apples, not #apples: + slideVariant <- gets stSlideVariant + let revealSlash = ['/' | slideVariant== RevealJsSlides] + return $ Just + $ if null id' + then (H.a $ toHtml txt) >> subList + else (H.a ! A.href (toValue $ "#" ++ revealSlash ++ + writerIdentifierPrefix opts ++ id') + $ toHtml txt) >> subList +elementToListItem _ _ = return Nothing + +-- | Convert an Element to Html. +elementToHtml :: PandocMonad m => Int -> WriterOptions -> Element -> StateT WriterState m Html +elementToHtml _slideLevel opts (Blk block) = blockToHtml opts block +elementToHtml slideLevel opts (Sec level num (id',classes,keyvals) title' elements) = do + slideVariant <- gets stSlideVariant + let slide = slideVariant /= NoSlides && level <= slideLevel + let num' = zipWith (+) num (writerNumberOffset opts ++ repeat 0) + modify $ \st -> st{stSecNum = num'} -- update section number + html5 <- gets stHtml5 + let titleSlide = slide && level < slideLevel + header' <- if title' == [Str "\0"] -- marker for hrule + then return mempty + else do + modify (\st -> st{ stElement = True}) + res <- blockToHtml opts + (Header level (id',classes,keyvals) title') + modify (\st -> st{ stElement = False}) + return res + + let isSec (Sec _ _ _ _ _) = True + isSec (Blk _) = False + let isPause (Blk x) = x == Para [Str ".",Space,Str ".",Space,Str "."] + isPause _ = False + let fragmentClass = case slideVariant of + RevealJsSlides -> "fragment" + _ -> "incremental" + let inDiv xs = Blk (RawBlock (Format "html") ("<div class=\"" + ++ fragmentClass ++ "\">")) : + (xs ++ [Blk (RawBlock (Format "html") "</div>")]) + innerContents <- mapM (elementToHtml slideLevel opts) + $ if titleSlide + -- title slides have no content of their own + then filter isSec elements + else case splitBy isPause elements of + [] -> [] + (x:xs) -> x ++ concatMap inDiv xs + let inNl x = mconcat $ nl opts : intersperse (nl opts) x ++ [nl opts] + let classes' = ["titleslide" | titleSlide] ++ ["slide" | slide] ++ + ["section" | (slide || writerSectionDivs opts) && + not html5 ] ++ + ["level" ++ show level | slide || writerSectionDivs opts ] + ++ classes + let secttag = if html5 + then H5.section + else H.div + let attr = (id',classes',keyvals) + return $ if titleSlide + then (if slideVariant == RevealJsSlides + then H5.section + else id) $ mconcat $ + (addAttrs opts attr $ secttag $ header') : innerContents + else if writerSectionDivs opts || slide + then addAttrs opts attr + $ secttag $ inNl $ header' : innerContents + else mconcat $ intersperse (nl opts) + $ addAttrs opts attr header' : innerContents + +-- | Convert list of Note blocks to a footnote <div>. +-- Assumes notes are sorted. +footnoteSection :: PandocMonad m + => WriterOptions -> [Html] -> StateT WriterState m Html +footnoteSection opts notes = do + html5 <- gets stHtml5 + slideVariant <- gets stSlideVariant + let hrtag = if html5 then H5.hr else H.hr + let container x = if html5 + then H5.section ! A.class_ "footnotes" $ x + else if slideVariant /= NoSlides + then H.div ! A.class_ "footnotes slide" $ x + else H.div ! A.class_ "footnotes" $ x + return $ + if null notes + then mempty + else nl opts >> (container + $ nl opts >> hrtag >> nl opts >> + H.ol (mconcat notes >> nl opts) >> nl opts) + +-- | Parse a mailto link; return Just (name, domain) or Nothing. +parseMailto :: String -> Maybe (String, String) +parseMailto s = do + case break (==':') s of + (xs,':':addr) | map toLower xs == "mailto" -> do + let (name', rest) = span (/='@') addr + let domain = drop 1 rest + return (name', domain) + _ -> fail "not a mailto: URL" + +-- | Obfuscate a "mailto:" link. +obfuscateLink :: PandocMonad m => WriterOptions -> Attr -> Html -> String -> m Html +obfuscateLink opts attr txt s | writerEmailObfuscation opts == NoObfuscation = + return $ addAttrs opts attr $ H.a ! A.href (toValue s) $ txt +obfuscateLink opts attr (renderHtml -> txt) s = + let meth = writerEmailObfuscation opts + s' = map toLower (take 7 s) ++ drop 7 s + in case parseMailto s' of + (Just (name', domain)) -> + let domain' = substitute "." " dot " domain + at' = obfuscateChar '@' + (linkText, altText) = + if txt == drop 7 s' -- autolink + then ("e", name' ++ " at " ++ domain') + else ("'" ++ obfuscateString txt ++ "'", + txt ++ " (" ++ name' ++ " at " ++ domain' ++ ")") + in case meth of + ReferenceObfuscation -> + -- need to use preEscapedString or &'s are escaped to & in URL + return $ + preEscapedString $ "<a href=\"" ++ (obfuscateString s') + ++ "\" class=\"email\">" ++ (obfuscateString txt) ++ "</a>" + JavascriptObfuscation -> + return $ + (H.script ! A.type_ "text/javascript" $ + preEscapedString ("\n<!--\nh='" ++ + obfuscateString domain ++ "';a='" ++ at' ++ "';n='" ++ + obfuscateString name' ++ "';e=n+a+h;\n" ++ + "document.write('<a h'+'ref'+'=\"ma'+'ilto'+':'+e+'\" clas'+'s=\"em' + 'ail\">'+" ++ + linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) >> + H.noscript (preEscapedString $ obfuscateString altText) + _ -> throwError $ PandocSomeError $ "Unknown obfuscation method: " ++ show meth + _ -> return $ addAttrs opts attr $ H.a ! A.href (toValue s) $ toHtml txt -- malformed email + +-- | Obfuscate character as entity. +obfuscateChar :: Char -> String +obfuscateChar char = + let num = ord char + numstr = if even num then show num else "x" ++ showHex num "" + in "&#" ++ numstr ++ ";" + +-- | Obfuscate string using entities. +obfuscateString :: String -> String +obfuscateString = concatMap obfuscateChar . fromEntities + +addAttrs :: WriterOptions -> Attr -> Html -> Html +addAttrs opts attr h = foldl (!) h (attrsToHtml opts attr) + +toAttrs :: [(String, String)] -> [Attribute] +toAttrs kvs = map (\(x,y) -> customAttribute (fromString x) (toValue y)) kvs + +attrsToHtml :: WriterOptions -> Attr -> [Attribute] +attrsToHtml opts (id',classes',keyvals) = + [prefixedId opts id' | not (null id')] ++ + [A.class_ (toValue $ unwords classes') | not (null classes')] ++ toAttrs keyvals + +imgAttrsToHtml :: WriterOptions -> Attr -> [Attribute] +imgAttrsToHtml opts attr = + attrsToHtml opts (ident,cls,kvs') ++ + toAttrs (dimensionsToAttrList opts attr) + where + (ident,cls,kvs) = attr + kvs' = filter isNotDim kvs + isNotDim ("width", _) = False + isNotDim ("height", _) = False + isNotDim _ = True + +dimensionsToAttrList :: WriterOptions -> Attr -> [(String, String)] +dimensionsToAttrList opts attr = (go Width) ++ (go Height) + where + go dir = case (dimension dir attr) of + (Just (Percent a)) -> [("style", show dir ++ ":" ++ show (Percent a))] + (Just dim) -> [(show dir, showInPixel opts dim)] + _ -> [] + + +imageExts :: [String] +imageExts = [ "art", "bmp", "cdr", "cdt", "cpt", "cr2", "crw", "djvu", "erf", + "gif", "ico", "ief", "jng", "jpg", "jpeg", "nef", "orf", "pat", "pbm", + "pcx", "pgm", "png", "pnm", "ppm", "psd", "ras", "rgb", "svg", "tiff", + "wbmp", "xbm", "xpm", "xwd" ] + +treatAsImage :: FilePath -> Bool +treatAsImage fp = + let path = case uriPath `fmap` parseURIReference fp of + Nothing -> fp + Just up -> up + ext = map toLower $ drop 1 $ takeExtension path + in null ext || ext `elem` imageExts + +-- | Convert Pandoc block element to HTML. +blockToHtml :: PandocMonad m => WriterOptions -> Block -> StateT WriterState m Html +blockToHtml _ Null = return mempty +blockToHtml opts (Plain lst) = inlineListToHtml opts lst +-- title beginning with fig: indicates that the image is a figure +blockToHtml opts (Para [Image attr txt (s,'f':'i':'g':':':tit)]) = do + img <- inlineToHtml opts (Image attr txt (s,tit)) + html5 <- gets stHtml5 + let tocapt = if html5 + then H5.figcaption + else H.p ! A.class_ "caption" + capt <- if null txt + then return mempty + else tocapt `fmap` inlineListToHtml opts txt + return $ if html5 + then H5.figure $ mconcat + [nl opts, img, capt, nl opts] + else H.div ! A.class_ "figure" $ mconcat + [nl opts, img, nl opts, capt, nl opts] +blockToHtml opts (Para lst) + | isEmptyRaw lst = return mempty + | otherwise = do + contents <- inlineListToHtml opts lst + return $ H.p contents + where + isEmptyRaw [RawInline f _] = f /= (Format "html") + isEmptyRaw _ = False +blockToHtml opts (LineBlock lns) = + if writerWrapText opts == WrapNone + then blockToHtml opts $ linesToPara lns + else do + let lf = preEscapedString "\n" + htmlLines <- mconcat . intersperse lf <$> mapM (inlineListToHtml opts) lns + return $ H.div ! A.style "white-space: pre-line;" $ htmlLines +blockToHtml opts (Div attr@(ident, classes, kvs) bs) = do + html5 <- gets stHtml5 + let speakerNotes = "notes" `elem` classes + -- we don't want incremental output inside speaker notes, see #1394 + let opts' = if speakerNotes then opts{ writerIncremental = False } else opts + contents <- blockListToHtml opts' bs + let contents' = nl opts >> contents >> nl opts + let (divtag, classes') = if html5 && "section" `elem` classes + then (H5.section, filter (/= "section") classes) + else (H.div, classes) + slideVariant <- gets stSlideVariant + return $ + if speakerNotes + then case slideVariant of + RevealJsSlides -> addAttrs opts' attr $ H5.aside $ contents' + DZSlides -> (addAttrs opts' attr $ H5.div $ contents') + ! (H5.customAttribute "role" "note") + NoSlides -> addAttrs opts' attr $ H.div $ contents' + _ -> mempty + else addAttrs opts (ident, classes', kvs) $ divtag $ contents' +blockToHtml opts (RawBlock f str) + | f == Format "html" = return $ preEscapedString str + | (f == Format "latex" || f == Format "tex") && + allowsMathEnvironments (writerHTMLMathMethod opts) && + isMathEnvironment str = blockToHtml opts $ Plain [Math DisplayMath str] + | otherwise = do + report $ BlockNotRendered (RawBlock f str) + return mempty +blockToHtml _ (HorizontalRule) = do + html5 <- gets stHtml5 + return $ if html5 then H5.hr else H.hr +blockToHtml opts (CodeBlock (id',classes,keyvals) rawCode) = do + let tolhs = isEnabled Ext_literate_haskell opts && + any (\c -> map toLower c == "haskell") classes && + any (\c -> map toLower c == "literate") classes + classes' = if tolhs + then map (\c -> if map toLower c == "haskell" + then "literatehaskell" + else c) classes + else classes + adjCode = if tolhs + then unlines . map ("> " ++) . lines $ rawCode + else rawCode + hlCode = if isJust (writerHighlightStyle opts) + then highlight formatHtmlBlock + (id',classes',keyvals) adjCode + else Nothing + case hlCode of + Nothing -> return $ addAttrs opts (id',classes,keyvals) + $ H.pre $ H.code $ toHtml adjCode + Just h -> modify (\st -> st{ stHighlighting = True }) >> + return (addAttrs opts (id',[],keyvals) h) +blockToHtml opts (BlockQuote blocks) = do + -- in S5, treat list in blockquote specially + -- if default is incremental, make it nonincremental; + -- otherwise incremental + slideVariant <- gets stSlideVariant + if slideVariant /= NoSlides + then let inc = not (writerIncremental opts) in + case blocks of + [BulletList lst] -> blockToHtml (opts {writerIncremental = inc}) + (BulletList lst) + [OrderedList attribs lst] -> + blockToHtml (opts {writerIncremental = inc}) + (OrderedList attribs lst) + [DefinitionList lst] -> + blockToHtml (opts {writerIncremental = inc}) + (DefinitionList lst) + _ -> do contents <- blockListToHtml opts blocks + return $ H.blockquote + $ nl opts >> contents >> nl opts + else do + contents <- blockListToHtml opts blocks + return $ H.blockquote $ nl opts >> contents >> nl opts +blockToHtml opts (Header level attr@(_,classes,_) lst) = do + contents <- inlineListToHtml opts lst + secnum <- liftM stSecNum get + let contents' = if writerNumberSections opts && not (null secnum) + && "unnumbered" `notElem` classes + then (H.span ! A.class_ "header-section-number" $ toHtml + $ showSecNum secnum) >> strToHtml " " >> contents + else contents + inElement <- gets stElement + return $ (if inElement then id else addAttrs opts attr) + $ case level of + 1 -> H.h1 contents' + 2 -> H.h2 contents' + 3 -> H.h3 contents' + 4 -> H.h4 contents' + 5 -> H.h5 contents' + 6 -> H.h6 contents' + _ -> H.p contents' +blockToHtml opts (BulletList lst) = do + contents <- mapM (blockListToHtml opts) lst + unordList opts contents +blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do + contents <- mapM (blockListToHtml opts) lst + html5 <- gets stHtml5 + let numstyle' = case numstyle of + Example -> "decimal" + _ -> camelCaseToHyphenated $ show numstyle + let attribs = (if startnum /= 1 + then [A.start $ toValue startnum] + else []) ++ + (if numstyle == Example + then [A.class_ "example"] + else []) ++ + (if numstyle /= DefaultStyle + then if html5 + then [A.type_ $ + case numstyle of + Decimal -> "1" + LowerAlpha -> "a" + UpperAlpha -> "A" + LowerRoman -> "i" + UpperRoman -> "I" + _ -> "1"] + else [A.style $ toValue $ "list-style-type: " ++ + numstyle'] + else []) + l <- ordList opts contents + return $ foldl (!) l attribs +blockToHtml opts (DefinitionList lst) = do + contents <- mapM (\(term, defs) -> + do term' <- if null term + then return mempty + else liftM H.dt $ inlineListToHtml opts term + defs' <- mapM ((liftM (\x -> H.dd $ (x >> nl opts))) . + blockListToHtml opts) defs + return $ mconcat $ nl opts : term' : nl opts : + intersperse (nl opts) defs') lst + defList opts contents +blockToHtml opts (Table capt aligns widths headers rows') = do + captionDoc <- if null capt + then return mempty + else do + cs <- inlineListToHtml opts capt + return $ H.caption cs >> nl opts + html5 <- gets stHtml5 + let percent w = show (truncate (100*w) :: Integer) ++ "%" + let coltags = if all (== 0.0) widths + then mempty + else do + H.colgroup $ do + nl opts + mapM_ (\w -> do + if html5 + then H.col ! A.style (toValue $ "width: " ++ + percent w) + else H.col ! A.width (toValue $ percent w) + nl opts) widths + nl opts + head' <- if all null headers + then return mempty + else do + contents <- tableRowToHtml opts aligns 0 headers + return $ H.thead (nl opts >> contents) >> nl opts + body' <- liftM (\x -> H.tbody (nl opts >> mconcat x)) $ + zipWithM (tableRowToHtml opts aligns) [1..] rows' + let tbl = H.table $ + nl opts >> captionDoc >> coltags >> head' >> body' >> nl opts + let totalWidth = sum widths + -- When widths of columns are < 100%, we need to set width for the whole + -- table, or some browsers give us skinny columns with lots of space between: + return $ if totalWidth == 0 || totalWidth == 1 + then tbl + else tbl ! A.style (toValue $ "width:" ++ + show (round (totalWidth * 100) :: Int) ++ "%;") + +tableRowToHtml :: PandocMonad m + => WriterOptions + -> [Alignment] + -> Int + -> [[Block]] + -> StateT WriterState m Html +tableRowToHtml opts aligns rownum cols' = do + let mkcell = if rownum == 0 then H.th else H.td + let rowclass = case rownum of + 0 -> "header" + x | x `rem` 2 == 1 -> "odd" + _ -> "even" + cols'' <- sequence $ zipWith + (\alignment item -> tableItemToHtml opts mkcell alignment item) + aligns cols' + return $ (H.tr ! A.class_ rowclass $ nl opts >> mconcat cols'') + >> nl opts + +alignmentToString :: Alignment -> [Char] +alignmentToString alignment = case alignment of + AlignLeft -> "left" + AlignRight -> "right" + AlignCenter -> "center" + AlignDefault -> "" + +tableItemToHtml :: PandocMonad m + => WriterOptions + -> (Html -> Html) + -> Alignment + -> [Block] + -> StateT WriterState m Html +tableItemToHtml opts tag' align' item = do + contents <- blockListToHtml opts item + html5 <- gets stHtml5 + let alignStr = alignmentToString align' + let attribs = if html5 + then A.style (toValue $ "text-align: " ++ alignStr ++ ";") + else A.align (toValue alignStr) + let tag'' = if null alignStr + then tag' + else tag' ! attribs + return $ (tag'' $ contents) >> nl opts + +toListItems :: WriterOptions -> [Html] -> [Html] +toListItems opts items = map (toListItem opts) items ++ [nl opts] + +toListItem :: WriterOptions -> Html -> Html +toListItem opts item = nl opts >> H.li item + +blockListToHtml :: PandocMonad m => WriterOptions -> [Block] -> StateT WriterState m Html +blockListToHtml opts lst = + fmap (mconcat . intersperse (nl opts)) $ mapM (blockToHtml opts) lst + +-- | Convert list of Pandoc inline elements to HTML. +inlineListToHtml :: PandocMonad m => WriterOptions -> [Inline] -> StateT WriterState m Html +inlineListToHtml opts lst = + mapM (inlineToHtml opts) lst >>= return . mconcat + +-- | Annotates a MathML expression with the tex source +annotateMML :: XML.Element -> String -> XML.Element +annotateMML e tex = math (unode "semantics" [cs, unode "annotation" (annotAttrs, tex)]) + where + cs = case elChildren e of + [] -> unode "mrow" () + [x] -> x + xs -> unode "mrow" xs + math childs = XML.Element q as [XML.Elem childs] l + where + (XML.Element q as _ l) = e + annotAttrs = [XML.Attr (unqual "encoding") "application/x-tex"] + + +-- | Convert Pandoc inline element to HTML. +inlineToHtml :: PandocMonad m + => WriterOptions -> Inline -> StateT WriterState m Html +inlineToHtml opts inline = do + html5 <- gets stHtml5 + case inline of + (Str str) -> return $ strToHtml str + (Space) -> return $ strToHtml " " + (SoftBreak) -> return $ case writerWrapText opts of + WrapNone -> preEscapedString " " + WrapAuto -> preEscapedString " " + WrapPreserve -> preEscapedString "\n" + (LineBreak) -> return $ (if html5 then H5.br else H.br) + <> strToHtml "\n" + (Span (id',classes,kvs) ils) + -> inlineListToHtml opts ils >>= + return . addAttrs opts attr' . H.span + where attr' = (id',classes',kvs') + classes' = filter (`notElem` ["csl-no-emph", + "csl-no-strong", + "csl-no-smallcaps"]) classes + kvs' = if null styles + then kvs + else (("style", concat styles) : kvs) + styles = ["font-style:normal;" + | "csl-no-emph" `elem` classes] + ++ ["font-weight:normal;" + | "csl-no-strong" `elem` classes] + ++ ["font-variant:normal;" + | "csl-no-smallcaps" `elem` classes] + (Emph lst) -> inlineListToHtml opts lst >>= return . H.em + (Strong lst) -> inlineListToHtml opts lst >>= return . H.strong + (Code attr str) -> case hlCode of + Nothing -> return + $ addAttrs opts attr + $ H.code $ strToHtml str + Just h -> do + modify $ \st -> st{ stHighlighting = True } + return $ addAttrs opts (id',[],keyvals) h + where (id',_,keyvals) = attr + hlCode = if isJust (writerHighlightStyle opts) + then highlight formatHtmlInline + attr str + else Nothing + (Strikeout lst) -> inlineListToHtml opts lst >>= + return . H.del + (SmallCaps lst) -> inlineListToHtml opts lst >>= + return . (H.span ! A.style "font-variant: small-caps;") + (Superscript lst) -> inlineListToHtml opts lst >>= return . H.sup + (Subscript lst) -> inlineListToHtml opts lst >>= return . H.sub + (Quoted quoteType lst) -> + let (leftQuote, rightQuote) = case quoteType of + SingleQuote -> (strToHtml "‘", + strToHtml "’") + DoubleQuote -> (strToHtml "“", + strToHtml "”") + in if writerHtmlQTags opts + then do + modify $ \st -> st{ stQuotes = True } + H.q `fmap` inlineListToHtml opts lst + else (\x -> leftQuote >> x >> rightQuote) + `fmap` inlineListToHtml opts lst + (Math t str) -> do + modify (\st -> st {stMath = True}) + let mathClass = toValue $ ("math " :: String) ++ + if t == InlineMath then "inline" else "display" + case writerHTMLMathMethod opts of + LaTeXMathML _ -> + -- putting LaTeXMathML in container with class "LaTeX" prevents + -- non-math elements on the page from being treated as math by + -- the javascript + return $ H.span ! A.class_ "LaTeX" $ + case t of + InlineMath -> toHtml ("$" ++ str ++ "$") + DisplayMath -> toHtml ("$$" ++ str ++ "$$") + JsMath _ -> do + let m = preEscapedString str + return $ case t of + InlineMath -> H.span ! A.class_ mathClass $ m + DisplayMath -> H.div ! A.class_ mathClass $ m + WebTeX url -> do + let imtag = if html5 then H5.img else H.img + let m = imtag ! A.style "vertical-align:middle" + ! A.src (toValue $ url ++ urlEncode str) + ! A.alt (toValue str) + ! A.title (toValue str) + let brtag = if html5 then H5.br else H.br + return $ case t of + InlineMath -> m + DisplayMath -> brtag >> m >> brtag + GladTeX -> + return $ case t of + InlineMath -> preEscapedString $ "<EQ ENV=\"math\">" ++ str ++ "</EQ>" + DisplayMath -> preEscapedString $ "<EQ ENV=\"displaymath\">" ++ str ++ "</EQ>" + MathML -> do + let conf = useShortEmptyTags (const False) + defaultConfigPP + res <- lift $ convertMath writeMathML t str + case res of + Right r -> return $ preEscapedString $ + ppcElement conf (annotateMML r str) + Left il -> (H.span ! A.class_ mathClass) <$> + inlineToHtml opts il + MathJax _ -> return $ H.span ! A.class_ mathClass $ toHtml $ + case t of + InlineMath -> "\\(" ++ str ++ "\\)" + DisplayMath -> "\\[" ++ str ++ "\\]" + KaTeX _ _ -> return $ H.span ! A.class_ mathClass $ + toHtml (case t of + InlineMath -> str + DisplayMath -> "\\displaystyle " ++ str) + PlainMath -> do + x <- lift (texMathToInlines t str) >>= inlineListToHtml opts + let m = H.span ! A.class_ mathClass $ x + let brtag = if html5 then H5.br else H.br + return $ case t of + InlineMath -> m + DisplayMath -> brtag >> m >> brtag + (RawInline f str) + | f == Format "html" -> return $ preEscapedString str + | otherwise -> do + report $ InlineNotRendered inline + return mempty + (Link attr txt (s,_)) | "mailto:" `isPrefixOf` s -> do + linkText <- inlineListToHtml opts txt + lift $ obfuscateLink opts attr linkText s + (Link attr txt (s,tit)) -> do + linkText <- inlineListToHtml opts txt + slideVariant <- gets stSlideVariant + let s' = case s of + '#':xs -> let prefix = if slideVariant == RevealJsSlides + then "/" + else writerIdentifierPrefix opts + in '#' : prefix ++ xs + _ -> s + let link = H.a ! A.href (toValue s') $ linkText + let link' = if txt == [Str (unEscapeString s)] + then link ! A.class_ "uri" + else link + let link'' = addAttrs opts attr link' + return $ if null tit + then link'' + else link'' ! A.title (toValue tit) + (Image attr txt (s,tit)) | treatAsImage s -> do + let alternate' = stringify txt + let attributes = [A.src $ toValue s] ++ + [A.title $ toValue tit | not (null tit)] ++ + [A.alt $ toValue alternate' | not (null txt)] ++ + imgAttrsToHtml opts attr + let tag = if html5 then H5.img else H.img + return $ foldl (!) tag attributes + -- note: null title included, as in Markdown.pl + (Image attr _ (s,tit)) -> do + let attributes = [A.src $ toValue s] ++ + [A.title $ toValue tit | not (null tit)] ++ + imgAttrsToHtml opts attr + return $ foldl (!) H5.embed attributes + -- note: null title included, as in Markdown.pl + (Note contents) -> do + notes <- gets stNotes + let number = (length notes) + 1 + let ref = show number + htmlContents <- blockListToNote opts ref contents + epubVersion <- gets stEPUBVersion + -- push contents onto front of notes + modify $ \st -> st {stNotes = (htmlContents:notes)} + slideVariant <- gets stSlideVariant + let revealSlash = ['/' | slideVariant == RevealJsSlides] + let link = H.a ! A.href (toValue $ "#" ++ + revealSlash ++ + writerIdentifierPrefix opts ++ "fn" ++ ref) + ! A.class_ "footnoteRef" + ! prefixedId opts ("fnref" ++ ref) + $ (if isJust epubVersion + then id + else H.sup) + $ toHtml ref + return $ case epubVersion of + Just EPUB3 -> link ! customAttribute "epub:type" "noteref" + _ -> link + (Cite cits il)-> do contents <- inlineListToHtml opts il + let citationIds = unwords $ map citationId cits + let result = H.span ! A.class_ "citation" $ contents + return $ if html5 + then result ! customAttribute "data-cites" (toValue citationIds) + else result + +blockListToNote :: PandocMonad m => WriterOptions -> String -> [Block] -> StateT WriterState m Html +blockListToNote opts ref blocks = + -- If last block is Para or Plain, include the backlink at the end of + -- that block. Otherwise, insert a new Plain block with the backlink. + let backlink = [Link nullAttr [Str "↩"] ("#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref,[])] + blocks' = if null blocks + then [] + else let lastBlock = last blocks + otherBlocks = init blocks + in case lastBlock of + (Para lst) -> otherBlocks ++ + [Para (lst ++ backlink)] + (Plain lst) -> otherBlocks ++ + [Plain (lst ++ backlink)] + _ -> otherBlocks ++ [lastBlock, + Plain backlink] + in do contents <- blockListToHtml opts blocks' + let noteItem = H.li ! (prefixedId opts ("fn" ++ ref)) $ contents + epubVersion <- gets stEPUBVersion + let noteItem' = case epubVersion of + Just EPUB3 -> noteItem ! customAttribute "epub:type" "footnote" + _ -> noteItem + return $ nl opts >> noteItem' + +-- Javascript snippet to render all KaTeX elements +renderKaTeX :: String +renderKaTeX = unlines [ + "window.onload = function(){var mathElements = document.getElementsByClassName(\"math\");" + , "for (var i=0; i < mathElements.length; i++)" + , "{" + , " var texText = mathElements[i].firstChild" + , " katex.render(texText.data, mathElements[i])" + , "}}" + ] + +isMathEnvironment :: String -> Bool +isMathEnvironment s = "\\begin{" `isPrefixOf` s && + envName `elem` mathmlenvs + where envName = takeWhile (/= '}') (drop 7 s) + mathmlenvs = [ "align" + , "align*" + , "alignat" + , "alignat*" + , "aligned" + , "alignedat" + , "array" + , "Bmatrix" + , "bmatrix" + , "cases" + , "CD" + , "eqnarray" + , "eqnarray*" + , "equation" + , "equation*" + , "gather" + , "gather*" + , "gathered" + , "matrix" + , "multline" + , "multline*" + , "pmatrix" + , "smallmatrix" + , "split" + , "subarray" + , "Vmatrix" + , "vmatrix" ] + +allowsMathEnvironments :: HTMLMathMethod -> Bool +allowsMathEnvironments (MathJax _) = True +allowsMathEnvironments (MathML) = True +allowsMathEnvironments (WebTeX _) = True +allowsMathEnvironments _ = False diff --git a/src/Text/Pandoc/Writers/Haddock.hs b/src/Text/Pandoc/Writers/Haddock.hs new file mode 100644 index 000000000..945e4a0f1 --- /dev/null +++ b/src/Text/Pandoc/Writers/Haddock.hs @@ -0,0 +1,370 @@ +{-# LANGUAGE OverloadedStrings, TupleSections, ScopedTypeVariables #-} +{- +Copyright (C) 2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Haddock + Copyright : Copyright (C) 2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to haddock markup. + +Haddock: <http://www.haskell.org/haddock/doc/html/> +-} +module Text.Pandoc.Writers.Haddock (writeHaddock) where +import Text.Pandoc.Definition +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Data.List ( intersperse, transpose ) +import Text.Pandoc.Pretty +import Control.Monad.State +import Text.Pandoc.Writers.Math (texMathToInlines) +import Network.URI (isURI) +import Data.Default +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +type Notes = [[Block]] +data WriterState = WriterState { stNotes :: Notes } +instance Default WriterState + where def = WriterState{ stNotes = [] } + +-- | Convert Pandoc to Haddock. +writeHaddock :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeHaddock opts document = + evalStateT (pandocToHaddock opts{ + writerWrapText = writerWrapText opts } document) def + +-- | Return haddock representation of document. +pandocToHaddock :: PandocMonad m + => WriterOptions -> Pandoc -> StateT WriterState m String +pandocToHaddock opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + body <- blockListToHaddock opts blocks + st <- get + notes' <- notesToHaddock opts (reverse $ stNotes st) + let render' :: Doc -> String + render' = render colwidth + let main = render' $ body <> + (if isEmpty notes' then empty else blankline <> notes') + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToHaddock opts) + (fmap (render colwidth) . inlineListToHaddock opts) + meta + let context = defField "body" main + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Return haddock representation of notes. +notesToHaddock :: PandocMonad m + => WriterOptions -> [[Block]] -> StateT WriterState m Doc +notesToHaddock opts notes = + if null notes + then return empty + else do + contents <- blockToHaddock opts $ OrderedList (1,DefaultStyle,DefaultDelim) notes + return $ text "#notes#" <> blankline <> contents + +-- | Escape special characters for Haddock. +escapeString :: String -> String +escapeString = escapeStringUsing haddockEscapes + where haddockEscapes = backslashEscapes "\\/'`\"@<" + +-- | Convert Pandoc block element to haddock. +blockToHaddock :: PandocMonad m + => WriterOptions -- ^ Options + -> Block -- ^ Block element + -> StateT WriterState m Doc +blockToHaddock _ Null = return empty +blockToHaddock opts (Div _ ils) = do + contents <- blockListToHaddock opts ils + return $ contents <> blankline +blockToHaddock opts (Plain inlines) = do + contents <- inlineListToHaddock opts inlines + return $ contents <> cr +-- title beginning with fig: indicates figure +blockToHaddock opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = + blockToHaddock opts (Para [Image attr alt (src,tit)]) +blockToHaddock opts (Para inlines) = + -- TODO: if it contains linebreaks, we need to use a @...@ block + (<> blankline) `fmap` blockToHaddock opts (Plain inlines) +blockToHaddock opts (LineBlock lns) = + blockToHaddock opts $ linesToPara lns +blockToHaddock _ b@(RawBlock f str) + | f == "haddock" = do + return $ text str <> text "\n" + | otherwise = do + report $ BlockNotRendered b + return empty +blockToHaddock opts HorizontalRule = + return $ blankline <> text (replicate (writerColumns opts) '_') <> blankline +blockToHaddock opts (Header level (ident,_,_) inlines) = do + contents <- inlineListToHaddock opts inlines + let attr' = if null ident + then empty + else cr <> text "#" <> text ident <> text "#" + return $ nowrap (text (replicate level '=') <> space <> contents) + <> attr' <> blankline +blockToHaddock _ (CodeBlock (_,_,_) str) = + return $ prefixed "> " (text str) <> blankline +-- Nothing in haddock corresponds to block quotes: +blockToHaddock opts (BlockQuote blocks) = + blockListToHaddock opts blocks +-- Haddock doesn't have tables. Use haddock tables in code. +blockToHaddock opts (Table caption aligns widths headers rows) = do + caption' <- inlineListToHaddock opts caption + let caption'' = if null caption + then empty + else blankline <> caption' <> blankline + rawHeaders <- mapM (blockListToHaddock opts) headers + rawRows <- mapM (mapM (blockListToHaddock opts)) rows + let isSimple = all (==0) widths + let isPlainBlock (Plain _) = True + isPlainBlock _ = False + let hasBlocks = not (all isPlainBlock $ concat . concat $ headers:rows) + (nst,tbl) <- case True of + _ | isSimple -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | not hasBlocks -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | otherwise -> fmap (id,) $ + gridTable opts (all null headers) aligns widths + rawHeaders rawRows + return $ (prefixed "> " $ nst $ tbl $$ blankline $$ caption'') $$ blankline +blockToHaddock opts (BulletList items) = do + contents <- mapM (bulletListItemToHaddock opts) items + return $ cat contents <> blankline +blockToHaddock opts (OrderedList (start,_,delim) items) = do + let attribs = (start, Decimal, delim) + let markers = orderedListMarkers attribs + let markers' = map (\m -> if length m < 3 + then m ++ replicate (3 - length m) ' ' + else m) markers + contents <- mapM (\(item, num) -> orderedListItemToHaddock opts item num) $ + zip markers' items + return $ cat contents <> blankline +blockToHaddock opts (DefinitionList items) = do + contents <- mapM (definitionListItemToHaddock opts) items + return $ cat contents <> blankline + +pandocTable :: PandocMonad m + => WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> StateT WriterState m Doc +pandocTable opts headless aligns widths rawHeaders rawRows = do + let isSimple = all (==0) widths + let alignHeader alignment = case alignment of + AlignLeft -> lblock + AlignCenter -> cblock + AlignRight -> rblock + AlignDefault -> lblock + let numChars = maximum . map offset + let widthsInChars = if isSimple + then map ((+2) . numChars) + $ transpose (rawHeaders : rawRows) + else map + (floor . (fromIntegral (writerColumns opts) *)) + widths + let makeRow = hcat . intersperse (lblock 1 (text " ")) . + (zipWith3 alignHeader aligns widthsInChars) + let rows' = map makeRow rawRows + let head' = makeRow rawHeaders + let maxRowHeight = maximum $ map height (head':rows') + let underline = cat $ intersperse (text " ") $ + map (\width -> text (replicate width '-')) widthsInChars + let border = if maxRowHeight > 1 + then text (replicate (sum widthsInChars + + length widthsInChars - 1) '-') + else if headless + then underline + else empty + let head'' = if headless + then empty + else border <> cr <> head' + let body = if maxRowHeight > 1 + then vsep rows' + else vcat rows' + let bottom = if headless + then underline + else border + return $ head'' $$ underline $$ body $$ bottom + +gridTable :: PandocMonad m + => WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> StateT WriterState m Doc +gridTable opts headless _aligns widths headers' rawRows = do + let numcols = length headers' + let widths' = if all (==0) widths + then replicate numcols (1.0 / fromIntegral numcols) + else widths + let widthsInChars = map (floor . (fromIntegral (writerColumns opts) *)) widths' + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = maximum (map height blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = chomp $ hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + let rows' = map (makeRow . map chomp) rawRows + let border ch = char '+' <> char ch <> + (hcat $ intersperse (char ch <> char '+' <> char ch) $ + map (\l -> text $ replicate l ch) widthsInChars) <> + char ch <> char '+' + let body = vcat $ intersperse (border '-') rows' + let head'' = if headless + then empty + else head' $$ border '=' + return $ border '-' $$ head'' $$ body $$ border '-' + +-- | Convert bullet list item (list of blocks) to haddock +bulletListItemToHaddock :: PandocMonad m + => WriterOptions -> [Block] -> StateT WriterState m Doc +bulletListItemToHaddock opts items = do + contents <- blockListToHaddock opts items + let sps = replicate (writerTabStop opts - 2) ' ' + let start = text ('-' : ' ' : sps) + -- remove trailing blank line if it is a tight list + let contents' = case reverse items of + (BulletList xs:_) | isTightList xs -> + chomp contents <> cr + (OrderedList _ xs:_) | isTightList xs -> + chomp contents <> cr + _ -> contents + return $ hang (writerTabStop opts) start $ contents' <> cr + +-- | Convert ordered list item (a list of blocks) to haddock +orderedListItemToHaddock :: PandocMonad m + => WriterOptions -- ^ options + -> String -- ^ list item marker + -> [Block] -- ^ list item (list of blocks) + -> StateT WriterState m Doc +orderedListItemToHaddock opts marker items = do + contents <- blockListToHaddock opts items + let sps = case length marker - writerTabStop opts of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + let start = text marker <> sps + return $ hang (writerTabStop opts) start $ contents <> cr + +-- | Convert definition list item (label, list of blocks) to haddock +definitionListItemToHaddock :: PandocMonad m + => WriterOptions + -> ([Inline],[[Block]]) + -> StateT WriterState m Doc +definitionListItemToHaddock opts (label, defs) = do + labelText <- inlineListToHaddock opts label + defs' <- mapM (mapM (blockToHaddock opts)) defs + let contents = vcat $ map (\d -> hang 4 empty $ vcat d <> cr) defs' + return $ nowrap (brackets labelText) <> cr <> contents <> cr + +-- | Convert list of Pandoc block elements to haddock +blockListToHaddock :: PandocMonad m + => WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> StateT WriterState m Doc +blockListToHaddock opts blocks = + mapM (blockToHaddock opts) blocks >>= return . cat + +-- | Convert list of Pandoc inline elements to haddock. +inlineListToHaddock :: PandocMonad m + => WriterOptions -> [Inline] -> StateT WriterState m Doc +inlineListToHaddock opts lst = + mapM (inlineToHaddock opts) lst >>= return . cat + +-- | Convert Pandoc inline element to haddock. +inlineToHaddock :: PandocMonad m + => WriterOptions -> Inline -> StateT WriterState m Doc +inlineToHaddock opts (Span (ident,_,_) ils) = do + contents <- inlineListToHaddock opts ils + if not (null ident) && null ils + then return $ "#" <> text ident <> "#" + else return contents +inlineToHaddock opts (Emph lst) = do + contents <- inlineListToHaddock opts lst + return $ "/" <> contents <> "/" +inlineToHaddock opts (Strong lst) = do + contents <- inlineListToHaddock opts lst + return $ "__" <> contents <> "__" +inlineToHaddock opts (Strikeout lst) = do + contents <- inlineListToHaddock opts lst + -- not supported in haddock, but we fake it: + return $ "~~" <> contents <> "~~" +-- not supported in haddock: +inlineToHaddock opts (Superscript lst) = inlineListToHaddock opts lst +-- not supported in haddock: +inlineToHaddock opts (Subscript lst) = inlineListToHaddock opts lst +-- not supported in haddock: +inlineToHaddock opts (SmallCaps lst) = inlineListToHaddock opts lst +inlineToHaddock opts (Quoted SingleQuote lst) = do + contents <- inlineListToHaddock opts lst + return $ "‘" <> contents <> "’" +inlineToHaddock opts (Quoted DoubleQuote lst) = do + contents <- inlineListToHaddock opts lst + return $ "“" <> contents <> "”" +inlineToHaddock _ (Code _ str) = + return $ "@" <> text (escapeString str) <> "@" +inlineToHaddock _ (Str str) = do + return $ text $ escapeString str +inlineToHaddock opts (Math mt str) = do + let adjust x = case mt of + DisplayMath -> cr <> x <> cr + InlineMath -> x + adjust <$> (lift (texMathToInlines mt str) >>= inlineListToHaddock opts) +inlineToHaddock _ il@(RawInline f str) + | f == "haddock" = return $ text str + | otherwise = do + report $ InlineNotRendered il + return empty +-- no line break in haddock (see above on CodeBlock) +inlineToHaddock _ LineBreak = return cr +inlineToHaddock opts SoftBreak = + case writerWrapText opts of + WrapAuto -> return space + WrapNone -> return space + WrapPreserve -> return cr +inlineToHaddock _ Space = return space +inlineToHaddock opts (Cite _ lst) = inlineListToHaddock opts lst +inlineToHaddock _ (Link _ txt (src, _)) = do + let linktext = text $ escapeString $ stringify txt + let useAuto = isURI src && + case txt of + [Str s] | escapeURI s == src -> True + _ -> False + return $ nowrap $ "<" <> text src <> + (if useAuto then empty else space <> linktext) <> ">" +inlineToHaddock opts (Image attr alternate (source, tit)) = do + linkhaddock <- inlineToHaddock opts (Link attr alternate (source, tit)) + return $ "<" <> linkhaddock <> ">" +-- haddock doesn't have notes, but we can fake it: +inlineToHaddock opts (Note contents) = do + modify (\st -> st{ stNotes = contents : stNotes st }) + st <- get + let ref = text $ writerIdentifierPrefix opts ++ show (length $ stNotes st) + return $ "<#notes [" <> ref <> "]>" diff --git a/src/Text/Pandoc/Writers/ICML.hs b/src/Text/Pandoc/Writers/ICML.hs new file mode 100644 index 000000000..efec17d26 --- /dev/null +++ b/src/Text/Pandoc/Writers/ICML.hs @@ -0,0 +1,584 @@ +{-# LANGUAGE OverloadedStrings, FlexibleContexts, ScopedTypeVariables #-} + +{- | + Module : Text.Pandoc.Writers.ICML + Copyright : Copyright (C) 2013-2016 github.com/mb21 + License : GNU GPL, version 2 or above + + Stability : alpha + +Conversion of 'Pandoc' documents to Adobe InCopy ICML, a stand-alone XML format +which is a subset of the zipped IDML format for which the documentation is +available here: http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/indesign/sdk/cs6/idml/idml-specification.pdf +InCopy is the companion word-processor to Adobe InDesign and ICML documents can be integrated +into InDesign with File -> Place. +-} +module Text.Pandoc.Writers.ICML (writeICML) where +import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError) +import Text.Pandoc.XML +import Text.Pandoc.Writers.Math (texMathToInlines) +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Shared (linesToPara, splitBy) +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import Data.List (isPrefixOf, isInfixOf, stripPrefix, intersperse) +import Data.Text as Text (breakOnAll, pack) +import Control.Monad.State +import Control.Monad.Except (runExceptT) +import Network.URI (isURI) +import qualified Data.Set as Set +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging +import qualified Text.Pandoc.Class as P + +type Style = [String] +type Hyperlink = [(Int, String)] + +data WriterState = WriterState{ + blockStyles :: Set.Set String + , inlineStyles :: Set.Set String + , links :: Hyperlink + , listDepth :: Int + , maxListDepth :: Int + } + +type WS m = StateT WriterState m + +defaultWriterState :: WriterState +defaultWriterState = WriterState{ + blockStyles = Set.empty + , inlineStyles = Set.empty + , links = [] + , listDepth = 1 + , maxListDepth = 0 + } + +-- inline names (appear in InDesign's character styles pane) +emphName :: String +strongName :: String +strikeoutName :: String +superscriptName :: String +subscriptName :: String +smallCapsName :: String +codeName :: String +linkName :: String +emphName = "Italic" +strongName = "Bold" +strikeoutName = "Strikeout" +superscriptName = "Superscript" +subscriptName = "Subscript" +smallCapsName = "SmallCaps" +codeName = "Code" +linkName = "Link" + +-- block element names (appear in InDesign's paragraph styles pane) +paragraphName :: String +figureName :: String +imgCaptionName :: String +codeBlockName :: String +blockQuoteName :: String +orderedListName :: String +bulletListName :: String +defListTermName :: String +defListDefName :: String +headerName :: String +tableName :: String +tableHeaderName :: String +tableCaptionName :: String +alignLeftName :: String +alignRightName :: String +alignCenterName :: String +firstListItemName :: String +beginsWithName :: String +lowerRomanName :: String +upperRomanName :: String +lowerAlphaName :: String +upperAlphaName :: String +subListParName :: String +footnoteName :: String +citeName :: String +paragraphName = "Paragraph" +figureName = "Figure" +imgCaptionName = "Caption" +codeBlockName = "CodeBlock" +blockQuoteName = "Blockquote" +orderedListName = "NumList" +bulletListName = "BulList" +defListTermName = "DefListTerm" +defListDefName = "DefListDef" +headerName = "Header" +tableName = "TablePar" +tableHeaderName = "TableHeader" +tableCaptionName = "TableCaption" +alignLeftName = "LeftAlign" +alignRightName = "RightAlign" +alignCenterName = "CenterAlign" +firstListItemName = "first" +beginsWithName = "beginsWith-" +lowerRomanName = "lowerRoman" +upperRomanName = "upperRoman" +lowerAlphaName = "lowerAlpha" +upperAlphaName = "upperAlpha" +subListParName = "subParagraph" +footnoteName = "Footnote" +citeName = "Cite" + +-- | Convert Pandoc document to string in ICML format. +writeICML :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeICML opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + render' = render colwidth + renderMeta f s = liftM (render' . fst) $ runStateT (f opts [] s) defaultWriterState + metadata <- metaToJSON opts + (renderMeta blocksToICML) + (renderMeta inlinesToICML) + meta + (doc, st) <- runStateT (blocksToICML opts [] blocks) defaultWriterState + let main = render' doc + context = defField "body" main + $ defField "charStyles" (render' $ charStylesToDoc st) + $ defField "parStyles" (render' $ parStylesToDoc st) + $ defField "hyperlinks" (render' $ hyperlinksToDoc $ links st) + $ metadata + return $ case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + +-- | Auxilary functions for parStylesToDoc and charStylesToDoc. +contains :: String -> (String, (String, String)) -> [(String, String)] +contains s rule = + if isInfixOf (fst rule) s + then [snd rule] + else [] + +-- | The monospaced font to use as default. +monospacedFont :: Doc +monospacedFont = inTags False "AppliedFont" [("type", "string")] $ text "Courier New" + +-- | How much to indent blockquotes etc. +defaultIndent :: Int +defaultIndent = 20 + +-- | How much to indent numbered lists before the number. +defaultListIndent :: Int +defaultListIndent = 10 + +-- other constants +lineSeparator :: String +lineSeparator = "
" + +-- | Convert a WriterState with its block styles to the ICML listing of Paragraph Styles. +parStylesToDoc :: WriterState -> Doc +parStylesToDoc st = vcat $ map makeStyle $ Set.toAscList $ blockStyles st + where + makeStyle s = + let countSubStrs sub str = length $ Text.breakOnAll (Text.pack sub) (Text.pack str) + attrs = concat $ map (contains s) $ [ + (defListTermName, ("BulletsAndNumberingListType", "BulletList")) + , (defListTermName, ("FontStyle", "Bold")) + , (tableHeaderName, ("FontStyle", "Bold")) + , (alignLeftName, ("Justification", "LeftAlign")) + , (alignRightName, ("Justification", "RightAlign")) + , (alignCenterName, ("Justification", "CenterAlign")) + , (headerName++"1", ("PointSize", "36")) + , (headerName++"2", ("PointSize", "30")) + , (headerName++"3", ("PointSize", "24")) + , (headerName++"4", ("PointSize", "18")) + , (headerName++"5", ("PointSize", "14")) + ] + -- what is the most nested list type, if any? + (isBulletList, isOrderedList) = findList $ reverse $ splitBy (==' ') s + where + findList [] = (False, False) + findList (x:xs) | x == bulletListName = (True, False) + | x == orderedListName = (False, True) + | otherwise = findList xs + nBuls = countSubStrs bulletListName s + nOrds = countSubStrs orderedListName s + attrs' = numbering ++ listType ++ indent ++ attrs + where + numbering | isOrderedList = [("NumberingExpression", "^#.^t"), ("NumberingLevel", show nOrds)] + | otherwise = [] + listType | isOrderedList && (not $ isInfixOf subListParName s) + = [("BulletsAndNumberingListType", "NumberedList")] + | isBulletList && (not $ isInfixOf subListParName s) + = [("BulletsAndNumberingListType", "BulletList")] + | otherwise = [] + indent = [("LeftIndent", show indt)] + where + nBlockQuotes = countSubStrs blockQuoteName s + nDefLists = countSubStrs defListDefName s + indt = max 0 $ defaultListIndent*(nBuls + nOrds - 1) + defaultIndent*(nBlockQuotes + nDefLists) + props = inTags True "Properties" [] $ (basedOn $$ tabList $$ numbForm) + where + font = if isInfixOf codeBlockName s + then monospacedFont + else empty + basedOn = inTags False "BasedOn" [("type", "object")] (text "$ID/NormalParagraphStyle") $$ font + tabList = if isBulletList + then inTags True "TabList" [("type","list")] $ inTags True "ListItem" [("type","record")] + $ vcat [ + inTags False "Alignment" [("type","enumeration")] $ text "LeftAlign" + , inTags False "AlignmentCharacter" [("type","string")] $ text "." + , selfClosingTag "Leader" [("type","string")] + , inTags False "Position" [("type","unit")] $ text + $ show $ defaultListIndent * (nBuls + nOrds) + ] + else empty + makeNumb name = inTags False "NumberingFormat" [("type", "string")] (text name) + numbForm | isInfixOf lowerRomanName s = makeNumb "i, ii, iii, iv..." + | isInfixOf upperRomanName s = makeNumb "I, II, III, IV..." + | isInfixOf lowerAlphaName s = makeNumb "a, b, c, d..." + | isInfixOf upperAlphaName s = makeNumb "A, B, C, D..." + | otherwise = empty + in inTags True "ParagraphStyle" ([("Self", "ParagraphStyle/"++s), ("Name", s)] ++ attrs') props + +-- | Convert a WriterState with its inline styles to the ICML listing of Character Styles. +charStylesToDoc :: WriterState -> Doc +charStylesToDoc st = vcat $ map makeStyle $ Set.toAscList $ inlineStyles st + where + makeStyle s = + let attrs = concat $ map (contains s) [ + (strikeoutName, ("StrikeThru", "true")) + , (superscriptName, ("Position", "Superscript")) + , (subscriptName, ("Position", "Subscript")) + , (smallCapsName, ("Capitalization", "SmallCaps")) + ] + attrs' | isInfixOf emphName s && isInfixOf strongName s = ("FontStyle", "Bold Italic") : attrs + | isInfixOf strongName s = ("FontStyle", "Bold") : attrs + | isInfixOf emphName s = ("FontStyle", "Italic") : attrs + | otherwise = attrs + props = inTags True "Properties" [] $ + inTags False "BasedOn" [("type", "object")] (text "$ID/NormalCharacterStyle") $$ font + where + font = + if isInfixOf codeName s + then monospacedFont + else empty + in inTags True "CharacterStyle" ([("Self", "CharacterStyle/"++s), ("Name", s)] ++ attrs') props + +-- | Escape colon characters as %3a +escapeColons :: String -> String +escapeColons (x:xs) + | x == ':' = "%3a" ++ escapeColons xs + | otherwise = x : escapeColons xs +escapeColons [] = [] + +-- | Convert a list of (identifier, url) pairs to the ICML listing of hyperlinks. +hyperlinksToDoc :: Hyperlink -> Doc +hyperlinksToDoc [] = empty +hyperlinksToDoc (x:xs) = hyp x $$ hyperlinksToDoc xs + where + hyp (ident, url) = hdest $$ hlink + where + hdest = selfClosingTag "HyperlinkURLDestination" + [("Self", "HyperlinkURLDestination/"++(escapeColons url)), ("Name","link"), ("DestinationURL",url), ("DestinationUniqueKey","1")] -- HyperlinkURLDestination with more than one colon crashes CS6 + hlink = inTags True "Hyperlink" [("Self","uf-"++show ident), ("Name",url), + ("Source","htss-"++show ident), ("Visible","true"), ("DestinationUniqueKey","1")] + $ inTags True "Properties" [] + $ inTags False "BorderColor" [("type","enumeration")] (text "Black") + $$ (inTags False "Destination" [("type","object")] + $ text $ "HyperlinkURLDestination/"++(escapeColons (escapeStringForXML url))) -- HyperlinkURLDestination with more than one colon crashes CS6 + + +-- | Convert a list of Pandoc blocks to ICML. +blocksToICML :: PandocMonad m => WriterOptions -> Style -> [Block] -> WS m Doc +blocksToICML opts style lst = do + docs <- mapM (blockToICML opts style) lst + return $ intersperseBrs docs + +-- | Convert a Pandoc block element to ICML. +blockToICML :: PandocMonad m => WriterOptions -> Style -> Block -> WS m Doc +blockToICML opts style (Plain lst) = parStyle opts style lst +-- title beginning with fig: indicates that the image is a figure +blockToICML opts style (Para img@[Image _ txt (_,'f':'i':'g':':':_)]) = do + figure <- parStyle opts (figureName:style) img + caption <- parStyle opts (imgCaptionName:style) txt + return $ intersperseBrs [figure, caption] +blockToICML opts style (Para lst) = parStyle opts (paragraphName:style) lst +blockToICML opts style (LineBlock lns) = + blockToICML opts style $ linesToPara lns +blockToICML opts style (CodeBlock _ str) = parStyle opts (codeBlockName:style) $ [Str str] +blockToICML _ _ b@(RawBlock f str) + | f == Format "icml" = return $ text str + | otherwise = do + report $ BlockNotRendered b + return empty +blockToICML opts style (BlockQuote blocks) = blocksToICML opts (blockQuoteName:style) blocks +blockToICML opts style (OrderedList attribs lst) = listItemsToICML opts orderedListName style (Just attribs) lst +blockToICML opts style (BulletList lst) = listItemsToICML opts bulletListName style Nothing lst +blockToICML opts style (DefinitionList lst) = intersperseBrs `fmap` mapM (definitionListItemToICML opts style) lst +blockToICML opts style (Header lvl _ lst) = + let stl = (headerName ++ show lvl):style + in parStyle opts stl lst +blockToICML _ _ HorizontalRule = return empty -- we could insert a page break instead +blockToICML opts style (Table caption aligns widths headers rows) = + let style' = tableName : style + noHeader = all null headers + nrHeaders = if noHeader + then "0" + else "1" + nrRows = length rows + nrCols = if null rows + then 0 + else length $ head rows + rowsToICML [] _ = return empty + rowsToICML (col:rest) rowNr = + liftM2 ($$) (colsToICML col aligns rowNr (0::Int)) $ rowsToICML rest (rowNr+1) + colsToICML [] _ _ _ = return empty + colsToICML _ [] _ _ = return empty + colsToICML (cell:rest) (alig:restAligns) rowNr colNr = do + let stl = if rowNr == 0 && not noHeader + then tableHeaderName:style' + else style' + stl' | alig == AlignLeft = alignLeftName : stl + | alig == AlignRight = alignRightName : stl + | alig == AlignCenter = alignCenterName : stl + | otherwise = stl + c <- blocksToICML opts stl' cell + let cl = return $ inTags True "Cell" + [("Name", show colNr ++":"++ show rowNr), ("AppliedCellStyle","CellStyle/Cell")] c + liftM2 ($$) cl $ colsToICML rest restAligns rowNr (colNr+1) + in do + let tabl = if noHeader + then rows + else headers:rows + cells <- rowsToICML tabl (0::Int) + let colWidths w = if w > 0 + then [("SingleColumnWidth",show $ 500 * w)] + else [] + let tupToDoc tup = selfClosingTag "Column" $ [("Name",show $ fst tup)] ++ (colWidths $ snd tup) + let colDescs = vcat $ map tupToDoc $ zip [0..nrCols-1] widths + let tableDoc = return $ inTags True "Table" [ + ("AppliedTableStyle","TableStyle/Table") + , ("HeaderRowCount", nrHeaders) + , ("BodyRowCount", show nrRows) + , ("ColumnCount", show nrCols) + ] (colDescs $$ cells) + liftM2 ($$) tableDoc $ parStyle opts (tableCaptionName:style) caption +blockToICML opts style (Div _ lst) = blocksToICML opts style lst +blockToICML _ _ Null = return empty + +-- | Convert a list of lists of blocks to ICML list items. +listItemsToICML :: PandocMonad m => WriterOptions -> String -> Style -> Maybe ListAttributes -> [[Block]] -> WS m Doc +listItemsToICML _ _ _ _ [] = return empty +listItemsToICML opts listType style attribs (first:rest) = do + st <- get + put st{ listDepth = 1 + listDepth st} + let stl = listType:style + let f = listItemToICML opts stl True attribs first + let r = map (listItemToICML opts stl False attribs) rest + docs <- sequence $ f:r + s <- get + let maxD = max (maxListDepth s) (listDepth s) + put s{ listDepth = 1, maxListDepth = maxD } + return $ intersperseBrs docs + +-- | Convert a list of blocks to ICML list items. +listItemToICML :: PandocMonad m => WriterOptions -> Style -> Bool-> Maybe ListAttributes -> [Block] -> WS m Doc +listItemToICML opts style isFirst attribs item = + let makeNumbStart (Just (beginsWith, numbStl, _)) = + let doN DefaultStyle = [] + doN LowerRoman = [lowerRomanName] + doN UpperRoman = [upperRomanName] + doN LowerAlpha = [lowerAlphaName] + doN UpperAlpha = [upperAlphaName] + doN _ = [] + bw = if beginsWith > 1 + then [beginsWithName ++ show beginsWith] + else [] + in doN numbStl ++ bw + makeNumbStart Nothing = [] + stl = if isFirst + then firstListItemName:style + else style + stl' = makeNumbStart attribs ++ stl + in if length item > 1 + then do + let insertTab (Para lst) = blockToICML opts (subListParName:style) $ Para $ (Str "\t"):lst + insertTab block = blockToICML opts style block + f <- blockToICML opts stl' $ head item + r <- mapM insertTab $ tail item + return $ intersperseBrs (f : r) + else blocksToICML opts stl' item + +definitionListItemToICML :: PandocMonad m => WriterOptions -> Style -> ([Inline],[[Block]]) -> WS m Doc +definitionListItemToICML opts style (term,defs) = do + term' <- parStyle opts (defListTermName:style) term + defs' <- mapM (blocksToICML opts (defListDefName:style)) defs + return $ intersperseBrs $ (term' : defs') + + +-- | Convert a list of inline elements to ICML. +inlinesToICML :: PandocMonad m => WriterOptions -> Style -> [Inline] -> WS m Doc +inlinesToICML opts style lst = vcat `fmap` mapM (inlineToICML opts style) (mergeSpaces lst) + +-- | Convert an inline element to ICML. +inlineToICML :: PandocMonad m => WriterOptions -> Style -> Inline -> WS m Doc +inlineToICML _ style (Str str) = charStyle style $ text $ escapeStringForXML str +inlineToICML opts style (Emph lst) = inlinesToICML opts (emphName:style) lst +inlineToICML opts style (Strong lst) = inlinesToICML opts (strongName:style) lst +inlineToICML opts style (Strikeout lst) = inlinesToICML opts (strikeoutName:style) lst +inlineToICML opts style (Superscript lst) = inlinesToICML opts (superscriptName:style) lst +inlineToICML opts style (Subscript lst) = inlinesToICML opts (subscriptName:style) lst +inlineToICML opts style (SmallCaps lst) = inlinesToICML opts (smallCapsName:style) lst +inlineToICML opts style (Quoted SingleQuote lst) = inlinesToICML opts style $ [Str "‘"] ++ lst ++ [Str "’"] +inlineToICML opts style (Quoted DoubleQuote lst) = inlinesToICML opts style $ [Str "“"] ++ lst ++ [Str "”"] +inlineToICML opts style (Cite _ lst) = inlinesToICML opts (citeName:style) lst +inlineToICML _ style (Code _ str) = charStyle (codeName:style) $ text $ escapeStringForXML str +inlineToICML _ style Space = charStyle style space +inlineToICML opts style SoftBreak = + case writerWrapText opts of + WrapAuto -> charStyle style space + WrapNone -> charStyle style space + WrapPreserve -> charStyle style cr +inlineToICML _ style LineBreak = charStyle style $ text lineSeparator +inlineToICML opts style (Math mt str) = + lift (texMathToInlines mt str) >>= + (fmap cat . mapM (inlineToICML opts style)) +inlineToICML _ _ il@(RawInline f str) + | f == Format "icml" = return $ text str + | otherwise = do + report $ InlineNotRendered il + return empty +inlineToICML opts style (Link _ lst (url, title)) = do + content <- inlinesToICML opts (linkName:style) lst + state $ \st -> + let ident = if null $ links st + then 1::Int + else 1 + (fst $ head $ links st) + newst = st{ links = (ident, url):(links st) } + cont = inTags True "HyperlinkTextSource" + [("Self","htss-"++show ident), ("Name",title), ("Hidden","false")] content + in (cont, newst) +inlineToICML opts style (Image attr _ target) = imageICML opts style attr target +inlineToICML opts style (Note lst) = footnoteToICML opts style lst +inlineToICML opts style (Span _ lst) = inlinesToICML opts style lst + +-- | Convert a list of block elements to an ICML footnote. +footnoteToICML :: PandocMonad m => WriterOptions -> Style -> [Block] -> WS m Doc +footnoteToICML opts style lst = + let insertTab (Para ls) = blockToICML opts (footnoteName:style) $ Para $ (Str "\t"):ls + insertTab block = blockToICML opts (footnoteName:style) block + in do + contents <- mapM insertTab lst + let number = inTags True "ParagraphStyleRange" [] $ + inTags True "CharacterStyleRange" [] $ inTagsSimple "Content" "<?ACE 4?>" + return $ inTags True "CharacterStyleRange" + [("AppliedCharacterStyle","$ID/NormalCharacterStyle"), ("Position","Superscript")] + $ inTags True "Footnote" [] $ number $$ intersperseBrs contents + +-- | Auxiliary function to merge Space elements into the adjacent Strs. +mergeSpaces :: [Inline] -> [Inline] +mergeSpaces ((Str s):(x:((Str s'):xs))) | isSp x = + mergeSpaces $ Str(s++" "++s') : xs +mergeSpaces (x:((Str s):xs)) | isSp x = mergeSpaces $ Str (" "++s) : xs +mergeSpaces ((Str s):(x:xs)) | isSp x = mergeSpaces $ Str (s++" ") : xs +mergeSpaces (x:xs) = x : (mergeSpaces xs) +mergeSpaces [] = [] + +isSp :: Inline -> Bool +isSp Space = True +isSp SoftBreak = True +isSp _ = False + +-- | Intersperse line breaks +intersperseBrs :: [Doc] -> Doc +intersperseBrs = vcat . intersperse (selfClosingTag "Br" []) . filter (not . isEmpty) + +-- | Wrap a list of inline elements in an ICML Paragraph Style +parStyle :: PandocMonad m => WriterOptions -> Style -> [Inline] -> WS m Doc +parStyle opts style lst = + let slipIn x y = if null y + then x + else x ++ " > " ++ y + stlStr = foldr slipIn [] $ reverse style + stl = if null stlStr + then "" + else "ParagraphStyle/" ++ stlStr + attrs = ("AppliedParagraphStyle", stl) + attrs' = if firstListItemName `elem` style + then let ats = attrs : [("NumberingContinue", "false")] + begins = filter (isPrefixOf beginsWithName) style + in if null begins + then ats + else let i = maybe "" id $ stripPrefix beginsWithName $ head begins + in ("NumberingStartAt", i) : ats + else [attrs] + in do + content <- inlinesToICML opts [] lst + let cont = inTags True "ParagraphStyleRange" attrs' content + state $ \st -> (cont, st{ blockStyles = Set.insert stlStr $ blockStyles st }) + +-- | Wrap a Doc in an ICML Character Style. +charStyle :: PandocMonad m => Style -> Doc -> WS m Doc +charStyle style content = + let (stlStr, attrs) = styleToStrAttr style + doc = inTags True "CharacterStyleRange" attrs $ inTagsSimple "Content" $ flush content + in do + state $ \st -> + let styles = if null stlStr + then st + else st{ inlineStyles = Set.insert stlStr $ inlineStyles st } + in (doc, styles) + +-- | Transform a Style to a tuple of String (eliminating duplicates and ordered) and corresponding attribute. +styleToStrAttr :: Style -> (String, [(String, String)]) +styleToStrAttr style = + let stlStr = unwords $ Set.toAscList $ Set.fromList style + stl = if null style + then "$ID/NormalCharacterStyle" + else "CharacterStyle/" ++ stlStr + attrs = [("AppliedCharacterStyle", stl)] + in (stlStr, attrs) + +-- | Assemble an ICML Image. +imageICML :: PandocMonad m => WriterOptions -> Style -> Attr -> Target -> WS m Doc +imageICML opts style attr (src, _) = do + res <- runExceptT $ lift $ P.fetchItem (writerSourceURL opts) src + imgS <- case res of + Left (_ :: PandocError) -> do + report $ CouldNotFetchResource src "" + return def + Right (img, _) -> do + case imageSize img of + Right size -> return size + Left msg -> do + report $ CouldNotDetermineImageSize src msg + return def + let (ow, oh) = sizeInPoints imgS + (imgWidth, imgHeight) = desiredSizeInPoints opts attr imgS + hw = showFl $ ow / 2 + hh = showFl $ oh / 2 + scale = showFl (imgWidth / ow) ++ " 0 0 " ++ showFl (imgHeight / oh) + src' = if isURI src then src else "file:" ++ src + (stlStr, attrs) = styleToStrAttr style + props = inTags True "Properties" [] $ inTags True "PathGeometry" [] + $ inTags True "GeometryPathType" [("PathOpen","false")] + $ inTags True "PathPointArray" [] + $ vcat [ + selfClosingTag "PathPointType" [("Anchor", "-"++hw++" -"++hh), + ("LeftDirection", "-"++hw++" -"++hh), ("RightDirection", "-"++hw++" -"++hh)] + , selfClosingTag "PathPointType" [("Anchor", "-"++hw++" "++hh), + ("LeftDirection", "-"++hw++" "++hh), ("RightDirection", "-"++hw++" "++hh)] + , selfClosingTag "PathPointType" [("Anchor", hw++" "++hh), + ("LeftDirection", hw++" "++hh), ("RightDirection", hw++" "++hh)] + , selfClosingTag "PathPointType" [("Anchor", hw++" -"++hh), + ("LeftDirection", hw++" -"++hh), ("RightDirection", hw++" -"++hh)] + ] + image = inTags True "Image" + [("Self","ue6"), ("ItemTransform", scale++" -"++hw++" -"++hh)] + $ vcat [ + inTags True "Properties" [] $ inTags True "Profile" [("type","string")] $ text "$ID/Embedded" + , selfClosingTag "Link" [("Self", "ueb"), ("LinkResourceURI", src')] + ] + doc = inTags True "CharacterStyleRange" attrs + $ inTags True "Rectangle" [("Self","uec"), ("StrokeWeight", "0"), + ("ItemTransform", scale++" "++hw++" -"++hh)] + $ (props $$ image) + state $ \st -> (doc, st{ inlineStyles = Set.insert stlStr $ inlineStyles st } ) diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs new file mode 100644 index 000000000..ac2b5d758 --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -0,0 +1,1388 @@ +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, + PatternGuards #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.LaTeX + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' format into LaTeX. +-} +module Text.Pandoc.Writers.LaTeX ( + writeLaTeX + , writeBeamer + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates +import Text.Pandoc.Logging +import Text.Printf ( printf ) +import Network.URI ( isURI, unEscapeString ) +import Data.Aeson (object, (.=), FromJSON) +import Data.List ( (\\), isInfixOf, stripPrefix, intercalate, intersperse, + nub, nubBy, foldl' ) +import Data.Char ( toLower, isPunctuation, isAscii, isLetter, isDigit, + ord, isAlphaNum ) +import Data.Maybe ( fromMaybe, isJust, catMaybes ) +import qualified Data.Text as T +import Control.Applicative ((<|>)) +import Control.Monad.State +import qualified Text.Parsec as P +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import Text.Pandoc.Slides +import Text.Pandoc.Highlighting (highlight, styleToLaTeX, + formatLaTeXInline, formatLaTeXBlock, + toListingsLanguage) +import Text.Pandoc.Class (PandocMonad, report) + +data WriterState = + WriterState { stInNote :: Bool -- true if we're in a note + , stInQuote :: Bool -- true if in a blockquote + , stInMinipage :: Bool -- true if in minipage + , stInHeading :: Bool -- true if in a section heading + , stNotes :: [Doc] -- notes in a minipage + , stOLLevel :: Int -- level of ordered list nesting + , stOptions :: WriterOptions -- writer options, so they don't have to be parameter + , stVerbInNote :: Bool -- true if document has verbatim text in note + , stTable :: Bool -- true if document has a table + , stStrikeout :: Bool -- true if document has strikeout + , stUrl :: Bool -- true if document has visible URL link + , stGraphics :: Bool -- true if document contains images + , stLHS :: Bool -- true if document has literate haskell code + , stBook :: Bool -- true if document uses book or memoir class + , stCsquotes :: Bool -- true if document uses csquotes + , stHighlighting :: Bool -- true if document has highlighted code + , stIncremental :: Bool -- true if beamer lists should be displayed bit by bit + , stInternalLinks :: [String] -- list of internal link targets + , stUsesEuro :: Bool -- true if euro symbol used + , stBeamer :: Bool -- produce beamer + } + +startingState :: WriterOptions -> WriterState +startingState options = WriterState { + stInNote = False + , stInQuote = False + , stInMinipage = False + , stInHeading = False + , stNotes = [] + , stOLLevel = 1 + , stOptions = options + , stVerbInNote = False + , stTable = False + , stStrikeout = False + , stUrl = False + , stGraphics = False + , stLHS = False + , stBook = (case writerTopLevelDivision options of + TopLevelPart -> True + TopLevelChapter -> True + _ -> False) + , stCsquotes = False + , stHighlighting = False + , stIncremental = writerIncremental options + , stInternalLinks = [] + , stUsesEuro = False + , stBeamer = False } + +-- | Convert Pandoc to LaTeX. +writeLaTeX :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeLaTeX options document = + evalStateT (pandocToLaTeX options document) $ + startingState options + +-- | Convert Pandoc to LaTeX Beamer. +writeBeamer :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeBeamer options document = + evalStateT (pandocToLaTeX options document) $ + (startingState options){ stBeamer = True } + +type LW m = StateT WriterState m + +pandocToLaTeX :: PandocMonad m + => WriterOptions -> Pandoc -> LW m String +pandocToLaTeX options (Pandoc meta blocks) = do + -- Strip off final 'references' header if --natbib or --biblatex + let method = writerCiteMethod options + let blocks' = if method == Biblatex || method == Natbib + then case reverse blocks of + (Div (_,["references"],_) _):xs -> reverse xs + _ -> blocks + else blocks + -- see if there are internal links + let isInternalLink (Link _ _ ('#':xs,_)) = [xs] + isInternalLink _ = [] + modify $ \s -> s{ stInternalLinks = query isInternalLink blocks' } + let template = maybe "" id $ writerTemplate options + -- set stBook depending on documentclass + let colwidth = if writerWrapText options == WrapAuto + then Just $ writerColumns options + else Nothing + metadata <- metaToJSON options + (fmap (render colwidth) . blockListToLaTeX) + (fmap (render colwidth) . inlineListToLaTeX) + meta + let bookClasses = ["memoir","book","report","scrreprt","scrbook"] + let documentClass = case P.parse pDocumentClass "template" template of + Right r -> r + Left _ -> "" + case lookup "documentclass" (writerVariables options) `mplus` + fmap stringify (lookupMeta "documentclass" meta) of + Just x | x `elem` bookClasses -> modify $ \s -> s{stBook = True} + | otherwise -> return () + Nothing | documentClass `elem` bookClasses + -> modify $ \s -> s{stBook = True} + | otherwise -> return () + -- check for \usepackage...{csquotes}; if present, we'll use + -- \enquote{...} for smart quotes: + let headerIncludesField :: FromJSON a => Maybe a + headerIncludesField = getField "header-includes" metadata + let headerIncludes = fromMaybe [] $ mplus + (fmap return headerIncludesField) + headerIncludesField + when (any (isInfixOf "{csquotes}") (template : headerIncludes)) $ + modify $ \s -> s{stCsquotes = True} + let (blocks'', lastHeader) = if writerCiteMethod options == Citeproc then + (blocks', []) + else case last blocks' of + Header 1 _ il -> (init blocks', il) + _ -> (blocks', []) + beamer <- gets stBeamer + blocks''' <- if beamer + then toSlides blocks'' + else return blocks'' + body <- mapM (elementToLaTeX options) $ hierarchicalize blocks''' + (biblioTitle :: String) <- liftM (render colwidth) $ inlineListToLaTeX lastHeader + let main = render colwidth $ vsep body + st <- get + titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta + authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta + let docLangs = nub $ query (extract "lang") blocks + let hasStringValue x = isJust (getField x metadata :: Maybe String) + let geometryFromMargins = intercalate [','] $ catMaybes $ + map (\(x,y) -> + ((x ++ "=") ++) <$> getField y metadata) + [("lmargin","margin-left") + ,("rmargin","margin-right") + ,("tmargin","margin-top") + ,("bmargin","margin-bottom") + ] + let context = defField "toc" (writerTableOfContents options) $ + defField "toc-depth" (show (writerTOCDepth options - + if stBook st + then 1 + else 0)) $ + defField "body" main $ + defField "title-meta" titleMeta $ + defField "author-meta" (intercalate "; " authorsMeta) $ + defField "documentclass" (if beamer + then ("beamer" :: String) + else if stBook st + then "book" + else "article") $ + defField "verbatim-in-note" (stVerbInNote st) $ + defField "tables" (stTable st) $ + defField "strikeout" (stStrikeout st) $ + defField "url" (stUrl st) $ + defField "numbersections" (writerNumberSections options) $ + defField "lhs" (stLHS st) $ + defField "graphics" (stGraphics st) $ + defField "book-class" (stBook st) $ + defField "euro" (stUsesEuro st) $ + defField "listings" (writerListings options || stLHS st) $ + defField "beamer" beamer $ + (if stHighlighting st + then case writerHighlightStyle options of + Just sty -> + defField "highlighting-macros" + (styleToLaTeX sty) + Nothing -> id + else id) $ + (case writerCiteMethod options of + Natbib -> defField "biblio-title" biblioTitle . + defField "natbib" True + Biblatex -> defField "biblio-title" biblioTitle . + defField "biblatex" True + _ -> id) $ + -- set lang to something so polyglossia/babel is included + defField "lang" (if null docLangs then ""::String else "en") $ + defField "otherlangs" docLangs $ + defField "colorlinks" (any hasStringValue + ["citecolor", "urlcolor", "linkcolor", "toccolor"]) $ + defField "dir" (if (null $ query (extract "dir") blocks) + then ""::String + else "ltr") $ + defField "section-titles" True $ + defField "geometry" geometryFromMargins $ + metadata + let toPolyObj lang = object [ "name" .= T.pack name + , "options" .= T.pack opts ] + where + (name, opts) = toPolyglossia lang + let lang = maybe [] (splitBy (=='-')) $ getField "lang" context + otherlangs = maybe [] (map $ splitBy (=='-')) $ getField "otherlangs" context + let context' = + defField "babel-lang" (toBabel lang) + $ defField "babel-otherlangs" (map toBabel otherlangs) + $ defField "babel-newcommands" (concatMap (\(poly, babel) -> + -- \textspanish and \textgalician are already used by babel + -- save them as \oritext... and let babel use that + if poly `elem` ["spanish", "galician"] + then "\\let\\oritext" ++ poly ++ "\\text" ++ poly ++ "\n" ++ + "\\AddBabelHook{" ++ poly ++ "}{beforeextras}" ++ + "{\\renewcommand{\\text" ++ poly ++ "}{\\oritext" + ++ poly ++ "}}\n" ++ + "\\AddBabelHook{" ++ poly ++ "}{afterextras}" ++ + "{\\renewcommand{\\text" ++ poly ++ "}[2][]{\\foreignlanguage{" + ++ poly ++ "}{##2}}}\n" + else "\\newcommand{\\text" ++ poly ++ "}[2][]{\\foreignlanguage{" + ++ babel ++ "}{#2}}\n" ++ + "\\newenvironment{" ++ poly ++ "}[2][]{\\begin{otherlanguage}{" + ++ babel ++ "}}{\\end{otherlanguage}}\n" + ) + -- eliminate duplicates that have same polyglossia name + $ nubBy (\a b -> fst a == fst b) + -- find polyglossia and babel names of languages used in the document + $ map (\l -> + let lng = splitBy (=='-') l + in (fst $ toPolyglossia lng, toBabel lng) + ) + docLangs ) + $ defField "polyglossia-lang" (toPolyObj lang) + $ defField "polyglossia-otherlangs" (map toPolyObj otherlangs) + $ defField "latex-dir-rtl" (case (getField "dir" context)::Maybe String of + Just "rtl" -> True + _ -> False) + $ context + return $ case writerTemplate options of + Nothing -> main + Just tpl -> renderTemplate' tpl context' + +-- | Convert Elements to LaTeX +elementToLaTeX :: PandocMonad m => WriterOptions -> Element -> LW m Doc +elementToLaTeX _ (Blk block) = blockToLaTeX block +elementToLaTeX opts (Sec level _ (id',classes,_) title' elements) = do + modify $ \s -> s{stInHeading = True} + header' <- sectionHeader ("unnumbered" `elem` classes) id' level title' + modify $ \s -> s{stInHeading = False} + innerContents <- mapM (elementToLaTeX opts) elements + return $ vsep (header' : innerContents) + +data StringContext = TextString + | URLString + | CodeString + deriving (Eq) + +-- escape things as needed for LaTeX +stringToLaTeX :: PandocMonad m => StringContext -> String -> LW m String +stringToLaTeX _ [] = return "" +stringToLaTeX ctx (x:xs) = do + opts <- gets stOptions + rest <- stringToLaTeX ctx xs + let ligatures = isEnabled Ext_smart opts && ctx == TextString + let isUrl = ctx == URLString + when (x == '€') $ + modify $ \st -> st{ stUsesEuro = True } + return $ + case x of + '€' -> "\\euro{}" ++ rest + '{' -> "\\{" ++ rest + '}' -> "\\}" ++ rest + '`' | ctx == CodeString -> "\\textasciigrave{}" ++ rest + '$' | not isUrl -> "\\$" ++ rest + '%' -> "\\%" ++ rest + '&' -> "\\&" ++ rest + '_' | not isUrl -> "\\_" ++ rest + '#' -> "\\#" ++ rest + '-' | not isUrl -> case xs of + -- prevent adjacent hyphens from forming ligatures + ('-':_) -> "-\\/" ++ rest + _ -> '-' : rest + '~' | not isUrl -> "\\textasciitilde{}" ++ rest + '^' -> "\\^{}" ++ rest + '\\'| isUrl -> '/' : rest -- NB. / works as path sep even on Windows + | otherwise -> "\\textbackslash{}" ++ rest + '|' | not isUrl -> "\\textbar{}" ++ rest + '<' -> "\\textless{}" ++ rest + '>' -> "\\textgreater{}" ++ rest + '[' -> "{[}" ++ rest -- to avoid interpretation as + ']' -> "{]}" ++ rest -- optional arguments + '\'' | ctx == CodeString -> "\\textquotesingle{}" ++ rest + '\160' -> "~" ++ rest + '\x202F' -> "\\," ++ rest + '\x2026' -> "\\ldots{}" ++ rest + '\x2018' | ligatures -> "`" ++ rest + '\x2019' | ligatures -> "'" ++ rest + '\x201C' | ligatures -> "``" ++ rest + '\x201D' | ligatures -> "''" ++ rest + '\x2014' | ligatures -> "---" ++ rest + '\x2013' | ligatures -> "--" ++ rest + _ -> x : rest + +toLabel :: PandocMonad m => String -> LW m String +toLabel z = go `fmap` stringToLaTeX URLString z + where go [] = "" + go (x:xs) + | (isLetter x || isDigit x) && isAscii x = x:go xs + | elem x ("_-+=:;." :: String) = x:go xs + | otherwise = "ux" ++ printf "%x" (ord x) ++ go xs + +-- | Puts contents into LaTeX command. +inCmd :: String -> Doc -> Doc +inCmd cmd contents = char '\\' <> text cmd <> braces contents + +toSlides :: PandocMonad m => [Block] -> LW m [Block] +toSlides bs = do + opts <- gets stOptions + let slideLevel = fromMaybe (getSlideLevel bs) $ writerSlideLevel opts + let bs' = prepSlides slideLevel bs + concat `fmap` (mapM (elementToBeamer slideLevel) $ hierarchicalize bs') + +elementToBeamer :: PandocMonad m => Int -> Element -> LW m [Block] +elementToBeamer _slideLevel (Blk b) = return [b] +elementToBeamer slideLevel (Sec lvl _num (ident,classes,kvs) tit elts) + | lvl > slideLevel = do + bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts + return $ Para ( RawInline "latex" "\\begin{block}{" + : tit ++ [RawInline "latex" "}"] ) + : bs ++ [RawBlock "latex" "\\end{block}"] + | lvl < slideLevel = do + bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts + return $ (Header lvl (ident,classes,kvs) tit) : bs + | otherwise = do -- lvl == slideLevel + -- note: [fragile] is required or verbatim breaks + let hasCodeBlock (CodeBlock _ _) = [True] + hasCodeBlock _ = [] + let hasCode (Code _ _) = [True] + hasCode _ = [] + let fragile = "fragile" `elem` classes || + not (null $ query hasCodeBlock elts ++ query hasCode elts) + let frameoptions = ["allowdisplaybreaks", "allowframebreaks", + "b", "c", "t", "environment", + "label", "plain", "shrink", "standout"] + let optionslist = ["fragile" | fragile] ++ + [k | k <- classes, k `elem` frameoptions] ++ + [k ++ "=" ++ v | (k,v) <- kvs, k `elem` frameoptions] + let options = if null optionslist + then "" + else "[" ++ intercalate "," optionslist ++ "]" + let slideStart = Para $ RawInline "latex" ("\\begin{frame}" ++ options) : + if tit == [Str "\0"] -- marker for hrule + then [] + else (RawInline "latex" "{") : tit ++ [RawInline "latex" "}"] + let slideEnd = RawBlock "latex" "\\end{frame}" + -- now carve up slide into blocks if there are sections inside + bs <- concat `fmap` mapM (elementToBeamer slideLevel) elts + return $ slideStart : bs ++ [slideEnd] + +isListBlock :: Block -> Bool +isListBlock (BulletList _) = True +isListBlock (OrderedList _ _) = True +isListBlock (DefinitionList _) = True +isListBlock _ = False + +isLineBreakOrSpace :: Inline -> Bool +isLineBreakOrSpace LineBreak = True +isLineBreakOrSpace SoftBreak = True +isLineBreakOrSpace Space = True +isLineBreakOrSpace _ = False + +-- | Convert Pandoc block element to LaTeX. +blockToLaTeX :: PandocMonad m + => Block -- ^ Block to convert + -> LW m Doc +blockToLaTeX Null = return empty +blockToLaTeX (Div (identifier,classes,kvs) bs) = do + beamer <- gets stBeamer + ref <- toLabel identifier + let linkAnchor = if null identifier + then empty + else "\\hypertarget" <> braces (text ref) <> + braces empty + let align dir txt = inCmd "begin" dir $$ txt $$ inCmd "end" dir + let wrapDir = case lookup "dir" kvs of + Just "rtl" -> align "RTL" + Just "ltr" -> align "LTR" + _ -> id + wrapLang txt = case lookup "lang" kvs of + Just lng -> let (l, o) = toPolyglossiaEnv lng + ops = if null o + then "" + else brackets $ text o + in inCmd "begin" (text l) <> ops + $$ blankline <> txt <> blankline + $$ inCmd "end" (text l) + Nothing -> txt + wrapNotes txt = if beamer && "notes" `elem` classes + then "\\note" <> braces txt -- speaker notes + else linkAnchor $$ txt + fmap (wrapDir . wrapLang . wrapNotes) $ blockListToLaTeX bs +blockToLaTeX (Plain lst) = + inlineListToLaTeX $ dropWhile isLineBreakOrSpace lst +-- title beginning with fig: indicates that the image is a figure +blockToLaTeX (Para [Image attr@(ident, _, _) txt (src,'f':'i':'g':':':tit)]) = do + inNote <- gets stInNote + modify $ \st -> st{ stInMinipage = True, stNotes = [] } + capt <- inlineListToLaTeX txt + notes <- gets stNotes + modify $ \st -> st{ stInMinipage = False, stNotes = [] } + + -- We can't have footnotes in the list of figures, so remove them: + captForLof <- if null notes + then return empty + else brackets <$> inlineListToLaTeX (walk deNote txt) + img <- inlineToLaTeX (Image attr txt (src,tit)) + let footnotes = notesToLaTeX notes + lab <- labelFor ident + let caption = "\\caption" <> captForLof <> braces capt <> lab + figure <- hypertarget ident (cr <> + "\\begin{figure}" $$ "\\centering" $$ img $$ + caption $$ "\\end{figure}" <> cr) + return $ if inNote + -- can't have figures in notes + then "\\begin{center}" $$ img $+$ capt $$ "\\end{center}" + else figure $$ footnotes +-- . . . indicates pause in beamer slides +blockToLaTeX (Para [Str ".",Space,Str ".",Space,Str "."]) = do + beamer <- gets stBeamer + if beamer + then blockToLaTeX (RawBlock "latex" "\\pause") + else inlineListToLaTeX [Str ".",Space,Str ".",Space,Str "."] +blockToLaTeX (Para lst) = + inlineListToLaTeX $ dropWhile isLineBreakOrSpace lst +blockToLaTeX (LineBlock lns) = do + blockToLaTeX $ linesToPara lns +blockToLaTeX (BlockQuote lst) = do + beamer <- gets stBeamer + case lst of + [b] | beamer && isListBlock b -> do + oldIncremental <- gets stIncremental + modify $ \s -> s{ stIncremental = not oldIncremental } + result <- blockToLaTeX b + modify $ \s -> s{ stIncremental = oldIncremental } + return result + _ -> do + oldInQuote <- gets stInQuote + modify (\s -> s{stInQuote = True}) + contents <- blockListToLaTeX lst + modify (\s -> s{stInQuote = oldInQuote}) + return $ "\\begin{quote}" $$ contents $$ "\\end{quote}" +blockToLaTeX (CodeBlock (identifier,classes,keyvalAttr) str) = do + opts <- gets stOptions + ref <- toLabel identifier + let linkAnchor = if null identifier + then empty + else "\\hypertarget" <> braces (text ref) <> + braces ("\\label" <> braces (text ref)) + let lhsCodeBlock = do + modify $ \s -> s{ stLHS = True } + return $ flush (linkAnchor $$ "\\begin{code}" $$ text str $$ + "\\end{code}") $$ cr + let rawCodeBlock = do + st <- get + env <- if stInNote st + then modify (\s -> s{ stVerbInNote = True }) >> + return "Verbatim" + else return "verbatim" + return $ flush (linkAnchor $$ text ("\\begin{" ++ env ++ "}") $$ + text str $$ text ("\\end{" ++ env ++ "}")) <> cr + let listingsCodeBlock = do + st <- get + let params = if writerListings (stOptions st) + then (case getListingsLanguage classes of + Just l -> [ "language=" ++ mbBraced l ] + Nothing -> []) ++ + [ "numbers=left" | "numberLines" `elem` classes + || "number" `elem` classes + || "number-lines" `elem` classes ] ++ + [ (if key == "startFrom" + then "firstnumber" + else key) ++ "=" ++ mbBraced attr | + (key,attr) <- keyvalAttr ] ++ + (if identifier == "" + then [] + else [ "label=" ++ ref ]) + + else [] + printParams + | null params = empty + | otherwise = brackets $ hcat (intersperse ", " + (map text params)) + return $ flush ("\\begin{lstlisting}" <> printParams $$ text str $$ + "\\end{lstlisting}") $$ cr + let highlightedCodeBlock = + case highlight formatLaTeXBlock ("",classes,keyvalAttr) str of + Nothing -> rawCodeBlock + Just h -> modify (\st -> st{ stHighlighting = True }) >> + return (flush $ linkAnchor $$ text (T.unpack h)) + case () of + _ | isEnabled Ext_literate_haskell opts && "haskell" `elem` classes && + "literate" `elem` classes -> lhsCodeBlock + | writerListings opts -> listingsCodeBlock + | not (null classes) && isJust (writerHighlightStyle opts) + -> highlightedCodeBlock + | otherwise -> rawCodeBlock +blockToLaTeX b@(RawBlock f x) + | f == Format "latex" || f == Format "tex" + = return $ text x + | otherwise = do + report $ BlockNotRendered b + return empty +blockToLaTeX (BulletList []) = return empty -- otherwise latex error +blockToLaTeX (BulletList lst) = do + incremental <- gets stIncremental + beamer <- gets stBeamer + let inc = if beamer && incremental then "[<+->]" else "" + items <- mapM listItemToLaTeX lst + let spacing = if isTightList lst + then text "\\tightlist" + else empty + return $ text ("\\begin{itemize}" ++ inc) $$ spacing $$ vcat items $$ + "\\end{itemize}" +blockToLaTeX (OrderedList _ []) = return empty -- otherwise latex error +blockToLaTeX (OrderedList (start, numstyle, numdelim) lst) = do + st <- get + let inc = if stIncremental st then "[<+->]" else "" + let oldlevel = stOLLevel st + put $ st {stOLLevel = oldlevel + 1} + items <- mapM listItemToLaTeX lst + modify (\s -> s {stOLLevel = oldlevel}) + let tostyle x = case numstyle of + Decimal -> "\\arabic" <> braces x + UpperRoman -> "\\Roman" <> braces x + LowerRoman -> "\\roman" <> braces x + UpperAlpha -> "\\Alph" <> braces x + LowerAlpha -> "\\alph" <> braces x + Example -> "\\arabic" <> braces x + DefaultStyle -> "\\arabic" <> braces x + let todelim x = case numdelim of + OneParen -> x <> ")" + TwoParens -> parens x + Period -> x <> "." + _ -> x <> "." + let enum = text $ "enum" ++ map toLower (toRomanNumeral oldlevel) + let stylecommand = if numstyle == DefaultStyle && numdelim == DefaultDelim + then empty + else "\\def" <> "\\label" <> enum <> + braces (todelim $ tostyle enum) + let resetcounter = if start == 1 || oldlevel > 4 + then empty + else "\\setcounter" <> braces enum <> + braces (text $ show $ start - 1) + let spacing = if isTightList lst + then text "\\tightlist" + else empty + return $ text ("\\begin{enumerate}" ++ inc) + $$ stylecommand + $$ resetcounter + $$ spacing + $$ vcat items + $$ "\\end{enumerate}" +blockToLaTeX (DefinitionList []) = return empty +blockToLaTeX (DefinitionList lst) = do + incremental <- gets stIncremental + let inc = if incremental then "[<+->]" else "" + items <- mapM defListItemToLaTeX lst + let spacing = if all isTightList (map snd lst) + then text "\\tightlist" + else empty + return $ text ("\\begin{description}" ++ inc) $$ spacing $$ vcat items $$ + "\\end{description}" +blockToLaTeX HorizontalRule = return $ + "\\begin{center}\\rule{0.5\\linewidth}{\\linethickness}\\end{center}" +blockToLaTeX (Header level (id',classes,_) lst) = do + modify $ \s -> s{stInHeading = True} + hdr <- sectionHeader ("unnumbered" `elem` classes) id' level lst + modify $ \s -> s{stInHeading = False} + return hdr +blockToLaTeX (Table caption aligns widths heads rows) = do + headers <- if all null heads + then return empty + else do + contents <- (tableRowToLaTeX True aligns widths) heads + return ("\\toprule" $$ contents $$ "\\midrule") + let endhead = if all null heads + then empty + else text "\\endhead" + let endfirsthead = if all null heads + then empty + else text "\\endfirsthead" + captionText <- inlineListToLaTeX caption + let capt = if isEmpty captionText + then empty + else text "\\caption" <> braces captionText <> "\\tabularnewline" + $$ headers + $$ endfirsthead + rows' <- mapM (tableRowToLaTeX False aligns widths) rows + let colDescriptors = text $ concat $ map toColDescriptor aligns + modify $ \s -> s{ stTable = True } + return $ "\\begin{longtable}[]" <> + braces ("@{}" <> colDescriptors <> "@{}") + -- the @{} removes extra space at beginning and end + $$ capt + $$ (if all null heads then "\\toprule" else empty) + $$ headers + $$ endhead + $$ vcat rows' + $$ "\\bottomrule" + $$ "\\end{longtable}" + +toColDescriptor :: Alignment -> String +toColDescriptor align = + case align of + AlignLeft -> "l" + AlignRight -> "r" + AlignCenter -> "c" + AlignDefault -> "l" + +blockListToLaTeX :: PandocMonad m => [Block] -> LW m Doc +blockListToLaTeX lst = vsep `fmap` mapM blockToLaTeX lst + +tableRowToLaTeX :: PandocMonad m + => Bool + -> [Alignment] + -> [Double] + -> [[Block]] + -> LW m Doc +tableRowToLaTeX header aligns widths cols = do + -- scale factor compensates for extra space between columns + -- so the whole table isn't larger than columnwidth + let scaleFactor = 0.97 ** fromIntegral (length aligns) + let isSimple [Plain _] = True + isSimple [Para _] = True + isSimple [] = True + isSimple _ = False + -- simple tables have to have simple cells: + let widths' = if not (all isSimple cols) + then replicate (length aligns) + (0.97 / fromIntegral (length aligns)) + else map (scaleFactor *) widths + cells <- mapM (tableCellToLaTeX header) $ zip3 widths' aligns cols + return $ hsep (intersperse "&" cells) <> "\\tabularnewline" + +-- For simple latex tables (without minipages or parboxes), +-- we need to go to some lengths to get line breaks working: +-- as LineBreak bs = \vtop{\hbox{\strut as}\hbox{\strut bs}}. +fixLineBreaks :: Block -> Block +fixLineBreaks (Para ils) = Para $ fixLineBreaks' ils +fixLineBreaks (Plain ils) = Plain $ fixLineBreaks' ils +fixLineBreaks x = x + +fixLineBreaks' :: [Inline] -> [Inline] +fixLineBreaks' ils = case splitBy (== LineBreak) ils of + [] -> [] + [xs] -> xs + chunks -> RawInline "tex" "\\vtop{" : + concatMap tohbox chunks ++ + [RawInline "tex" "}"] + where tohbox ys = RawInline "tex" "\\hbox{\\strut " : ys ++ + [RawInline "tex" "}"] + +-- We also change display math to inline math, since display +-- math breaks in simple tables. +displayMathToInline :: Inline -> Inline +displayMathToInline (Math DisplayMath x) = Math InlineMath x +displayMathToInline x = x + +tableCellToLaTeX :: PandocMonad m => Bool -> (Double, Alignment, [Block]) + -> LW m Doc +tableCellToLaTeX _ (0, _, blocks) = + blockListToLaTeX $ walk fixLineBreaks $ walk displayMathToInline blocks +tableCellToLaTeX header (width, align, blocks) = do + modify $ \st -> st{ stInMinipage = True, stNotes = [] } + cellContents <- blockListToLaTeX blocks + notes <- gets stNotes + modify $ \st -> st{ stInMinipage = False, stNotes = [] } + let valign = text $ if header then "[b]" else "[t]" + let halign = case align of + AlignLeft -> "\\raggedright" + AlignRight -> "\\raggedleft" + AlignCenter -> "\\centering" + AlignDefault -> "\\raggedright" + return $ ("\\begin{minipage}" <> valign <> + braces (text (printf "%.2f\\columnwidth" width)) <> + (halign <> "\\strut" <> cr <> cellContents <> "\\strut" <> cr) <> + "\\end{minipage}") $$ + notesToLaTeX notes + +notesToLaTeX :: [Doc] -> Doc +notesToLaTeX [] = empty +notesToLaTeX ns = (case length ns of + n | n > 1 -> "\\addtocounter" <> + braces "footnote" <> + braces (text $ show $ 1 - n) + | otherwise -> empty) + $$ + vcat (intersperse + ("\\addtocounter" <> braces "footnote" <> braces "1") + $ map (\x -> "\\footnotetext" <> braces x) + $ reverse ns) + +listItemToLaTeX :: PandocMonad m => [Block] -> LW m Doc +listItemToLaTeX lst + -- we need to put some text before a header if it's the first + -- element in an item. This will look ugly in LaTeX regardless, but + -- this will keep the typesetter from throwing an error. + | ((Header _ _ _) :_) <- lst = + blockListToLaTeX lst >>= return . (text "\\item ~" $$) . (nest 2) + | otherwise = blockListToLaTeX lst >>= return . (text "\\item" $$) . + (nest 2) + +defListItemToLaTeX :: PandocMonad m => ([Inline], [[Block]]) -> LW m Doc +defListItemToLaTeX (term, defs) = do + term' <- inlineListToLaTeX term + -- put braces around term if it contains an internal link, + -- since otherwise we get bad bracket interactions: \item[\hyperref[..] + let isInternalLink (Link _ _ ('#':_,_)) = True + isInternalLink _ = False + let term'' = if any isInternalLink term + then braces term' + else term' + def' <- liftM vsep $ mapM blockListToLaTeX defs + return $ case defs of + (((Header _ _ _) : _) : _) -> + "\\item" <> brackets term'' <> " ~ " $$ def' + _ -> + "\\item" <> brackets term'' $$ def' + +-- | Craft the section header, inserting the secton reference, if supplied. +sectionHeader :: PandocMonad m + => Bool -- True for unnumbered + -> [Char] + -> Int + -> [Inline] + -> LW m Doc +sectionHeader unnumbered ident level lst = do + txt <- inlineListToLaTeX lst + plain <- stringToLaTeX TextString $ concatMap stringify lst + let removeInvalidInline (Note _) = [] + removeInvalidInline (Span (id', _, _) _) | not (null id') = [] + removeInvalidInline (Image _ _ _) = [] + removeInvalidInline x = [x] + let lstNoNotes = foldr (mappend . (\x -> walkM removeInvalidInline x)) mempty lst + txtNoNotes <- inlineListToLaTeX lstNoNotes + -- footnotes in sections don't work (except for starred variants) + -- unless you specify an optional argument: + -- \section[mysec]{mysec\footnote{blah}} + optional <- if unnumbered || lstNoNotes == lst || lstNoNotes == [] + then return empty + else do + return $ brackets txtNoNotes + let contents = if render Nothing txt == plain + then braces txt + else braces (text "\\texorpdfstring" + <> braces txt + <> braces (text plain)) + book <- gets stBook + opts <- gets stOptions + let topLevelDivision = if book && writerTopLevelDivision opts == TopLevelDefault + then TopLevelChapter + else writerTopLevelDivision opts + beamer <- gets stBeamer + let level' = if beamer && + topLevelDivision `elem` [TopLevelPart, TopLevelChapter] + -- beamer has parts but no chapters + then if level == 1 then -1 else level - 1 + else case topLevelDivision of + TopLevelPart -> level - 2 + TopLevelChapter -> level - 1 + TopLevelSection -> level + TopLevelDefault -> level + let sectionType = case level' of + -1 -> "part" + 0 -> "chapter" + 1 -> "section" + 2 -> "subsection" + 3 -> "subsubsection" + 4 -> "paragraph" + 5 -> "subparagraph" + _ -> "" + inQuote <- gets stInQuote + let prefix = if inQuote && level' >= 4 + then text "\\mbox{}%" + -- needed for \paragraph, \subparagraph in quote environment + -- see http://tex.stackexchange.com/questions/169830/ + else empty + lab <- labelFor ident + let star = if unnumbered && level' < 4 then text "*" else empty + let stuffing = star <> optional <> contents + stuffing' <- hypertarget ident $ text ('\\':sectionType) <> stuffing <> lab + return $ if level' > 5 + then txt + else prefix $$ stuffing' + $$ if unnumbered + then "\\addcontentsline{toc}" <> + braces (text sectionType) <> + braces txtNoNotes + else empty + +hypertarget :: PandocMonad m => String -> Doc -> LW m Doc +hypertarget ident x = do + ref <- text `fmap` toLabel ident + internalLinks <- gets stInternalLinks + return $ + if ident `elem` internalLinks + then text "\\hypertarget" + <> braces ref + <> braces x + else x + +labelFor :: PandocMonad m => String -> LW m Doc +labelFor "" = return empty +labelFor ident = do + ref <- text `fmap` toLabel ident + return $ text "\\label" <> braces ref + +-- | Convert list of inline elements to LaTeX. +inlineListToLaTeX :: PandocMonad m + => [Inline] -- ^ Inlines to convert + -> LW m Doc +inlineListToLaTeX lst = + mapM inlineToLaTeX (fixBreaks $ fixLineInitialSpaces lst) + >>= return . hcat + -- nonbreaking spaces (~) in LaTeX don't work after line breaks, + -- so we turn nbsps after hard breaks to \hspace commands. + -- this is mostly used in verse. + where fixLineInitialSpaces [] = [] + fixLineInitialSpaces (LineBreak : Str s@('\160':_) : xs) = + LineBreak : fixNbsps s ++ fixLineInitialSpaces xs + fixLineInitialSpaces (x:xs) = x : fixLineInitialSpaces xs + fixNbsps s = let (ys,zs) = span (=='\160') s + in replicate (length ys) hspace ++ [Str zs] + hspace = RawInline "latex" "\\hspace*{0.333em}" + -- linebreaks after blank lines cause problems: + fixBreaks [] = [] + fixBreaks ys@(LineBreak : LineBreak : _) = + case span (== LineBreak) ys of + (lbs, rest) -> RawInline "latex" + ("\\\\[" ++ show (length lbs) ++ + "\\baselineskip]") : fixBreaks rest + fixBreaks (y:ys) = y : fixBreaks ys + +isQuoted :: Inline -> Bool +isQuoted (Quoted _ _) = True +isQuoted _ = False + +-- | Convert inline element to LaTeX +inlineToLaTeX :: PandocMonad m + => Inline -- ^ Inline to convert + -> LW m Doc +inlineToLaTeX (Span (id',classes,kvs) ils) = do + ref <- toLabel id' + let linkAnchor = if null id' + then empty + else "\\protect\\hypertarget" <> braces (text ref) <> + braces empty + let cmds = ["textup" | "csl-no-emph" `elem` classes] ++ + ["textnormal" | "csl-no-strong" `elem` classes || + "csl-no-smallcaps" `elem` classes] ++ + ["RL" | ("dir", "rtl") `elem` kvs] ++ + ["LR" | ("dir", "ltr") `elem` kvs] ++ + (case lookup "lang" kvs of + Just lng -> let (l, o) = toPolyglossia $ splitBy (=='-') lng + ops = if null o then "" else ("[" ++ o ++ "]") + in ["text" ++ l ++ ops] + Nothing -> []) + contents <- inlineListToLaTeX ils + return $ linkAnchor <> + if null cmds + then braces contents + else foldr inCmd contents cmds +inlineToLaTeX (Emph lst) = + inlineListToLaTeX lst >>= return . inCmd "emph" +inlineToLaTeX (Strong lst) = + inlineListToLaTeX lst >>= return . inCmd "textbf" +inlineToLaTeX (Strikeout lst) = do + -- we need to protect VERB in an mbox or we get an error + -- see #1294 + contents <- inlineListToLaTeX $ protectCode lst + modify $ \s -> s{ stStrikeout = True } + return $ inCmd "sout" contents +inlineToLaTeX (Superscript lst) = + inlineListToLaTeX lst >>= return . inCmd "textsuperscript" +inlineToLaTeX (Subscript lst) = do + inlineListToLaTeX lst >>= return . inCmd "textsubscript" +inlineToLaTeX (SmallCaps lst) = + inlineListToLaTeX lst >>= return . inCmd "textsc" +inlineToLaTeX (Cite cits lst) = do + st <- get + let opts = stOptions st + case writerCiteMethod opts of + Natbib -> citationsToNatbib cits + Biblatex -> citationsToBiblatex cits + _ -> inlineListToLaTeX lst + +inlineToLaTeX (Code (_,classes,_) str) = do + opts <- gets stOptions + inHeading <- gets stInHeading + case () of + _ | writerListings opts && not inHeading -> listingsCode + | isJust (writerHighlightStyle opts) && not (null classes) + -> highlightCode + | otherwise -> rawCode + where listingsCode = do + let listingsopt = case getListingsLanguage classes of + Just l -> "[language=" ++ mbBraced l ++ "]" + Nothing -> "" + inNote <- gets stInNote + when inNote $ modify $ \s -> s{ stVerbInNote = True } + let chr = case "!\"&'()*,-./:;?@_" \\ str of + (c:_) -> c + [] -> '!' + return $ text $ "\\lstinline" ++ listingsopt ++ [chr] ++ str ++ [chr] + highlightCode = do + case highlight formatLaTeXInline ("",classes,[]) str of + Nothing -> rawCode + Just h -> modify (\st -> st{ stHighlighting = True }) >> + return (text (T.unpack h)) + rawCode = liftM (text . (\s -> "\\texttt{" ++ escapeSpaces s ++ "}")) + $ stringToLaTeX CodeString str + where + escapeSpaces = concatMap (\c -> if c == ' ' then "\\ " else [c]) +inlineToLaTeX (Quoted qt lst) = do + contents <- inlineListToLaTeX lst + csquotes <- liftM stCsquotes get + opts <- gets stOptions + if csquotes + then return $ "\\enquote" <> braces contents + else do + let s1 = if (not (null lst)) && (isQuoted (head lst)) + then "\\," + else empty + let s2 = if (not (null lst)) && (isQuoted (last lst)) + then "\\," + else empty + let inner = s1 <> contents <> s2 + return $ case qt of + DoubleQuote -> + if isEnabled Ext_smart opts + then text "``" <> inner <> text "''" + else char '\x201C' <> inner <> char '\x201D' + SingleQuote -> + if isEnabled Ext_smart opts + then char '`' <> inner <> char '\'' + else char '\x2018' <> inner <> char '\x2019' +inlineToLaTeX (Str str) = liftM text $ stringToLaTeX TextString str +inlineToLaTeX (Math InlineMath str) = + return $ "\\(" <> text str <> "\\)" +inlineToLaTeX (Math DisplayMath str) = + return $ "\\[" <> text str <> "\\]" +inlineToLaTeX il@(RawInline f str) + | f == Format "latex" || f == Format "tex" + = return $ text str + | otherwise = do + report $ InlineNotRendered il + return empty +inlineToLaTeX (LineBreak) = return $ "\\\\" <> cr +inlineToLaTeX SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + case wrapText of + WrapAuto -> return space + WrapNone -> return space + WrapPreserve -> return cr +inlineToLaTeX Space = return space +inlineToLaTeX (Link _ txt ('#':ident, _)) = do + contents <- inlineListToLaTeX txt + lab <- toLabel ident + return $ text "\\protect\\hyperlink" <> braces (text lab) <> braces contents +inlineToLaTeX (Link _ txt (src, _)) = + case txt of + [Str x] | escapeURI x == src -> -- autolink + do modify $ \s -> s{ stUrl = True } + src' <- stringToLaTeX URLString (escapeURI src) + return $ text $ "\\url{" ++ src' ++ "}" + [Str x] | Just rest <- stripPrefix "mailto:" src, + escapeURI x == rest -> -- email autolink + do modify $ \s -> s{ stUrl = True } + src' <- stringToLaTeX URLString (escapeURI src) + contents <- inlineListToLaTeX txt + return $ "\\href" <> braces (text src') <> + braces ("\\nolinkurl" <> braces contents) + _ -> do contents <- inlineListToLaTeX txt + src' <- stringToLaTeX URLString (escapeURI src) + return $ text ("\\href{" ++ src' ++ "}{") <> + contents <> char '}' +inlineToLaTeX (Image attr _ (source, _)) = do + modify $ \s -> s{ stGraphics = True } + opts <- gets stOptions + let showDim dir = let d = text (show dir) <> "=" + in case (dimension dir attr) of + Just (Pixel a) -> + [d <> text (showInInch opts (Pixel a)) <> "in"] + Just (Percent a) -> + [d <> text (showFl (a / 100)) <> "\\textwidth"] + Just dim -> + [d <> text (show dim)] + Nothing -> + [] + dimList = showDim Width ++ showDim Height + dims = if null dimList + then empty + else brackets $ cat (intersperse "," dimList) + source' = if isURI source + then source + else unEscapeString source + source'' <- stringToLaTeX URLString source' + inHeading <- gets stInHeading + return $ + (if inHeading then "\\protect\\includegraphics" else "\\includegraphics") <> + dims <> braces (text source'') +inlineToLaTeX (Note contents) = do + inMinipage <- gets stInMinipage + modify (\s -> s{stInNote = True}) + contents' <- blockListToLaTeX contents + modify (\s -> s {stInNote = False}) + let optnl = case reverse contents of + (CodeBlock _ _ : _) -> cr + _ -> empty + let noteContents = nest 2 contents' <> optnl + beamer <- gets stBeamer + -- in beamer slides, display footnote from current overlay forward + let beamerMark = if beamer + then text "<.->" + else empty + modify $ \st -> st{ stNotes = noteContents : stNotes st } + return $ + if inMinipage + then "\\footnotemark{}" + -- note: a \n before } needed when note ends with a Verbatim environment + else "\\footnote" <> beamerMark <> braces noteContents + +protectCode :: [Inline] -> [Inline] +protectCode [] = [] +protectCode (x@(Code ("",[],[]) _) : xs) = x : protectCode xs +protectCode (x@(Code _ _) : xs) = ltx "\\mbox{" : x : ltx "}" : xs + where ltx = RawInline (Format "latex") +protectCode (x : xs) = x : protectCode xs + +citationsToNatbib :: PandocMonad m => [Citation] -> LW m Doc +citationsToNatbib (one:[]) + = citeCommand c p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = one + c = case m of + AuthorInText -> "citet" + SuppressAuthor -> "citeyearpar" + NormalCitation -> "citep" + +citationsToNatbib cits + | noPrefix (tail cits) && noSuffix (init cits) && ismode NormalCitation cits + = citeCommand "citep" p s ks + where + noPrefix = all (null . citationPrefix) + noSuffix = all (null . citationSuffix) + ismode m = all (((==) m) . citationMode) + p = citationPrefix $ head $ cits + s = citationSuffix $ last $ cits + ks = intercalate ", " $ map citationId cits + +citationsToNatbib (c:cs) | citationMode c == AuthorInText = do + author <- citeCommand "citeauthor" [] [] (citationId c) + cits <- citationsToNatbib (c { citationMode = SuppressAuthor } : cs) + return $ author <+> cits + +citationsToNatbib cits = do + cits' <- mapM convertOne cits + return $ text "\\citetext{" <> foldl' combineTwo empty cits' <> text "}" + where + combineTwo a b | isEmpty a = b + | otherwise = a <> text "; " <> b + convertOne Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = case m of + AuthorInText -> citeCommand "citealt" p s k + SuppressAuthor -> citeCommand "citeyear" p s k + NormalCitation -> citeCommand "citealp" p s k + +citeCommand :: PandocMonad m + => String -> [Inline] -> [Inline] -> String -> LW m Doc +citeCommand c p s k = do + args <- citeArguments p s k + return $ text ("\\" ++ c) <> args + +citeArguments :: PandocMonad m + => [Inline] -> [Inline] -> String -> LW m Doc +citeArguments p s k = do + let s' = case s of + (Str (x:[]) : r) | isPunctuation x -> dropWhile (== Space) r + (Str (x:xs) : r) | isPunctuation x -> Str xs : r + _ -> s + pdoc <- inlineListToLaTeX p + sdoc <- inlineListToLaTeX s' + let optargs = case (isEmpty pdoc, isEmpty sdoc) of + (True, True ) -> empty + (True, False) -> brackets sdoc + (_ , _ ) -> brackets pdoc <> brackets sdoc + return $ optargs <> braces (text k) + +citationsToBiblatex :: PandocMonad m => [Citation] -> LW m Doc +citationsToBiblatex (one:[]) + = citeCommand cmd p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } = one + cmd = case m of + SuppressAuthor -> "autocite*" + AuthorInText -> "textcite" + NormalCitation -> "autocite" + +citationsToBiblatex (c:cs) = do + args <- mapM convertOne (c:cs) + return $ text cmd <> foldl' (<>) empty args + where + cmd = case citationMode c of + SuppressAuthor -> "\\autocites*" + AuthorInText -> "\\textcites" + NormalCitation -> "\\autocites" + convertOne Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + } + = citeArguments p s k + +citationsToBiblatex _ = return empty + +-- Determine listings language from list of class attributes. +getListingsLanguage :: [String] -> Maybe String +getListingsLanguage [] = Nothing +getListingsLanguage (x:xs) = toListingsLanguage x <|> getListingsLanguage xs + +mbBraced :: String -> String +mbBraced x = if not (all isAlphaNum x) + then "{" <> x <> "}" + else x + +-- Extract a key from divs and spans +extract :: String -> Block -> [String] +extract key (Div attr _) = lookKey key attr +extract key (Plain ils) = concatMap (extractInline key) ils +extract key (Para ils) = concatMap (extractInline key) ils +extract key (Header _ _ ils) = concatMap (extractInline key) ils +extract _ _ = [] + +-- Extract a key from spans +extractInline :: String -> Inline -> [String] +extractInline key (Span attr _) = lookKey key attr +extractInline _ _ = [] + +-- Look up a key in an attribute and give a list of its values +lookKey :: String -> Attr -> [String] +lookKey key (_,_,kvs) = maybe [] words $ lookup key kvs + +-- In environments \Arabic instead of \arabic is used +toPolyglossiaEnv :: String -> (String, String) +toPolyglossiaEnv l = + case toPolyglossia $ (splitBy (=='-')) l of + ("arabic", o) -> ("Arabic", o) + x -> x + +-- Takes a list of the constituents of a BCP 47 language code and +-- converts it to a Polyglossia (language, options) tuple +-- http://mirrors.ctan.org/macros/latex/contrib/polyglossia/polyglossia.pdf +toPolyglossia :: [String] -> (String, String) +toPolyglossia ("ar":"DZ":_) = ("arabic", "locale=algeria") +toPolyglossia ("ar":"IQ":_) = ("arabic", "locale=mashriq") +toPolyglossia ("ar":"JO":_) = ("arabic", "locale=mashriq") +toPolyglossia ("ar":"LB":_) = ("arabic", "locale=mashriq") +toPolyglossia ("ar":"LY":_) = ("arabic", "locale=libya") +toPolyglossia ("ar":"MA":_) = ("arabic", "locale=morocco") +toPolyglossia ("ar":"MR":_) = ("arabic", "locale=mauritania") +toPolyglossia ("ar":"PS":_) = ("arabic", "locale=mashriq") +toPolyglossia ("ar":"SY":_) = ("arabic", "locale=mashriq") +toPolyglossia ("ar":"TN":_) = ("arabic", "locale=tunisia") +toPolyglossia ("de":"1901":_) = ("german", "spelling=old") +toPolyglossia ("de":"AT":"1901":_) = ("german", "variant=austrian, spelling=old") +toPolyglossia ("de":"AT":_) = ("german", "variant=austrian") +toPolyglossia ("de":"CH":"1901":_) = ("german", "variant=swiss, spelling=old") +toPolyglossia ("de":"CH":_) = ("german", "variant=swiss") +toPolyglossia ("de":_) = ("german", "") +toPolyglossia ("dsb":_) = ("lsorbian", "") +toPolyglossia ("el":"polyton":_) = ("greek", "variant=poly") +toPolyglossia ("en":"AU":_) = ("english", "variant=australian") +toPolyglossia ("en":"CA":_) = ("english", "variant=canadian") +toPolyglossia ("en":"GB":_) = ("english", "variant=british") +toPolyglossia ("en":"NZ":_) = ("english", "variant=newzealand") +toPolyglossia ("en":"UK":_) = ("english", "variant=british") +toPolyglossia ("en":"US":_) = ("english", "variant=american") +toPolyglossia ("grc":_) = ("greek", "variant=ancient") +toPolyglossia ("hsb":_) = ("usorbian", "") +toPolyglossia ("la":"x":"classic":_) = ("latin", "variant=classic") +toPolyglossia ("sl":_) = ("slovenian", "") +toPolyglossia x = (commonFromBcp47 x, "") + +-- Takes a list of the constituents of a BCP 47 language code and +-- converts it to a Babel language string. +-- http://mirrors.ctan.org/macros/latex/required/babel/base/babel.pdf +-- List of supported languages (slightly outdated): +-- http://tug.ctan.org/language/hyph-utf8/doc/generic/hyph-utf8/hyphenation.pdf +toBabel :: [String] -> String +toBabel ("de":"1901":_) = "german" +toBabel ("de":"AT":"1901":_) = "austrian" +toBabel ("de":"AT":_) = "naustrian" +toBabel ("de":"CH":"1901":_) = "swissgerman" +toBabel ("de":"CH":_) = "nswissgerman" +toBabel ("de":_) = "ngerman" +toBabel ("dsb":_) = "lowersorbian" +toBabel ("el":"polyton":_) = "polutonikogreek" +toBabel ("en":"AU":_) = "australian" +toBabel ("en":"CA":_) = "canadian" +toBabel ("en":"GB":_) = "british" +toBabel ("en":"NZ":_) = "newzealand" +toBabel ("en":"UK":_) = "british" +toBabel ("en":"US":_) = "american" +toBabel ("fr":"CA":_) = "canadien" +toBabel ("fra":"aca":_) = "acadian" +toBabel ("grc":_) = "polutonikogreek" +toBabel ("hsb":_) = "uppersorbian" +toBabel ("la":"x":"classic":_) = "classiclatin" +toBabel ("sl":_) = "slovene" +toBabel x = commonFromBcp47 x + +-- Takes a list of the constituents of a BCP 47 language code +-- and converts it to a string shared by Babel and Polyglossia. +-- https://tools.ietf.org/html/bcp47#section-2.1 +commonFromBcp47 :: [String] -> String +commonFromBcp47 [] = "" +commonFromBcp47 ("pt":"BR":_) = "brazil" +-- Note: documentation says "brazilian" works too, but it doesn't seem to work +-- on some systems. See #2953. +commonFromBcp47 ("sr":"Cyrl":_) = "serbianc" +commonFromBcp47 ("zh":"Latn":"pinyin":_) = "pinyin" +commonFromBcp47 x = fromIso $ head x + where + fromIso "af" = "afrikaans" + fromIso "am" = "amharic" + fromIso "ar" = "arabic" + fromIso "as" = "assamese" + fromIso "ast" = "asturian" + fromIso "bg" = "bulgarian" + fromIso "bn" = "bengali" + fromIso "bo" = "tibetan" + fromIso "br" = "breton" + fromIso "ca" = "catalan" + fromIso "cy" = "welsh" + fromIso "cs" = "czech" + fromIso "cop" = "coptic" + fromIso "da" = "danish" + fromIso "dv" = "divehi" + fromIso "el" = "greek" + fromIso "en" = "english" + fromIso "eo" = "esperanto" + fromIso "es" = "spanish" + fromIso "et" = "estonian" + fromIso "eu" = "basque" + fromIso "fa" = "farsi" + fromIso "fi" = "finnish" + fromIso "fr" = "french" + fromIso "fur" = "friulan" + fromIso "ga" = "irish" + fromIso "gd" = "scottish" + fromIso "gez" = "ethiopic" + fromIso "gl" = "galician" + fromIso "he" = "hebrew" + fromIso "hi" = "hindi" + fromIso "hr" = "croatian" + fromIso "hu" = "magyar" + fromIso "hy" = "armenian" + fromIso "ia" = "interlingua" + fromIso "id" = "indonesian" + fromIso "ie" = "interlingua" + fromIso "is" = "icelandic" + fromIso "it" = "italian" + fromIso "jp" = "japanese" + fromIso "km" = "khmer" + fromIso "kmr" = "kurmanji" + fromIso "kn" = "kannada" + fromIso "ko" = "korean" + fromIso "la" = "latin" + fromIso "lo" = "lao" + fromIso "lt" = "lithuanian" + fromIso "lv" = "latvian" + fromIso "ml" = "malayalam" + fromIso "mn" = "mongolian" + fromIso "mr" = "marathi" + fromIso "nb" = "norsk" + fromIso "nl" = "dutch" + fromIso "nn" = "nynorsk" + fromIso "no" = "norsk" + fromIso "nqo" = "nko" + fromIso "oc" = "occitan" + fromIso "pa" = "panjabi" + fromIso "pl" = "polish" + fromIso "pms" = "piedmontese" + fromIso "pt" = "portuguese" + fromIso "rm" = "romansh" + fromIso "ro" = "romanian" + fromIso "ru" = "russian" + fromIso "sa" = "sanskrit" + fromIso "se" = "samin" + fromIso "sk" = "slovak" + fromIso "sq" = "albanian" + fromIso "sr" = "serbian" + fromIso "sv" = "swedish" + fromIso "syr" = "syriac" + fromIso "ta" = "tamil" + fromIso "te" = "telugu" + fromIso "th" = "thai" + fromIso "ti" = "ethiopic" + fromIso "tk" = "turkmen" + fromIso "tr" = "turkish" + fromIso "uk" = "ukrainian" + fromIso "ur" = "urdu" + fromIso "vi" = "vietnamese" + fromIso _ = "" + +pDocumentOptions :: P.Parsec String () [String] +pDocumentOptions = do + P.char '[' + opts <- P.sepBy + (P.many $ P.spaces *> P.noneOf (" ,]" :: String) <* P.spaces) + (P.char ',') + P.char ']' + return opts + +pDocumentClass :: P.Parsec String () String +pDocumentClass = + do P.skipMany (P.satisfy (/='\\')) + P.string "\\documentclass" + classOptions <- pDocumentOptions <|> return [] + if ("article" :: String) `elem` classOptions + then return "article" + else do P.skipMany (P.satisfy (/='{')) + P.char '{' + P.manyTill P.letter (P.char '}') diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs new file mode 100644 index 000000000..f33acef32 --- /dev/null +++ b/src/Text/Pandoc/Writers/Man.hs @@ -0,0 +1,381 @@ +{- +Copyright (C) 2007-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Man + Copyright : Copyright (C) 2007-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to groff man page format. + +-} +module Text.Pandoc.Writers.Man ( writeMan) where +import Text.Pandoc.Definition +import Text.Pandoc.Templates +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Writers.Math +import Text.Printf ( printf ) +import Data.List ( stripPrefix, intersperse, intercalate ) +import Data.Maybe (fromMaybe) +import Text.Pandoc.Pretty +import Text.Pandoc.Builder (deleteMeta) +import Control.Monad.State +import Text.Pandoc.Error +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +type Notes = [[Block]] +data WriterState = WriterState { stNotes :: Notes + , stHasTables :: Bool } + +-- | Convert Pandoc to Man. +writeMan :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeMan opts document = evalStateT (pandocToMan opts document) (WriterState [] False) + +-- | Return groff man representation of document. +pandocToMan :: PandocMonad m => WriterOptions -> Pandoc -> StateT WriterState m String +pandocToMan opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + let render' = render colwidth + titleText <- inlineListToMan opts $ docTitle meta + let title' = render' titleText + let setFieldsFromTitle = + case break (== ' ') title' of + (cmdName, rest) -> case break (=='(') cmdName of + (xs, '(':ys) | not (null ys) && + last ys == ')' -> + defField "title" xs . + defField "section" (init ys) . + case splitBy (=='|') rest of + (ft:hds) -> + defField "footer" (trim ft) . + defField "header" + (trim $ concat hds) + [] -> id + _ -> defField "title" title' + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToMan opts) + (fmap (render colwidth) . inlineListToMan opts) + $ deleteMeta "title" meta + body <- blockListToMan opts blocks + notes <- liftM stNotes get + notes' <- notesToMan opts (reverse notes) + let main = render' $ body $$ notes' $$ text "" + hasTables <- liftM stHasTables get + let context = defField "body" main + $ setFieldsFromTitle + $ defField "has-tables" hasTables + $ defField "hyphenate" True + $ defField "pandoc-version" pandocVersion + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Return man representation of notes. +notesToMan :: PandocMonad m => WriterOptions -> [[Block]] -> StateT WriterState m Doc +notesToMan opts notes = + if null notes + then return empty + else mapM (\(num, note) -> noteToMan opts num note) (zip [1..] notes) >>= + return . (text ".SH NOTES" $$) . vcat + +-- | Return man representation of a note. +noteToMan :: PandocMonad m => WriterOptions -> Int -> [Block] -> StateT WriterState m Doc +noteToMan opts num note = do + contents <- blockListToMan opts note + let marker = cr <> text ".SS " <> brackets (text (show num)) + return $ marker $$ contents + +-- | Association list of characters to escape. +manEscapes :: [(Char, String)] +manEscapes = [ ('\160', "\\ ") + , ('\'', "\\[aq]") + , ('’', "'") + , ('\x2014', "\\[em]") + , ('\x2013', "\\[en]") + , ('\x2026', "\\&...") + ] ++ backslashEscapes "-@\\" + +-- | Escape special characters for Man. +escapeString :: String -> String +escapeString = escapeStringUsing manEscapes + +-- | Escape a literal (code) section for Man. +escapeCode :: String -> String +escapeCode = concat . intersperse "\n" . map escapeLine . lines where + escapeLine codeline = + case escapeStringUsing (manEscapes ++ backslashEscapes "\t ") codeline of + a@('.':_) -> "\\&" ++ a + b -> b + +-- We split inline lists into sentences, and print one sentence per +-- line. groff/troff treats the line-ending period differently. +-- See http://code.google.com/p/pandoc/issues/detail?id=148. + +-- | Returns the first sentence in a list of inlines, and the rest. +breakSentence :: [Inline] -> ([Inline], [Inline]) +breakSentence [] = ([],[]) +breakSentence xs = + let isSentenceEndInline (Str ys@(_:_)) | last ys == '.' = True + isSentenceEndInline (Str ys@(_:_)) | last ys == '?' = True + isSentenceEndInline (LineBreak) = True + isSentenceEndInline _ = False + (as, bs) = break isSentenceEndInline xs + in case bs of + [] -> (as, []) + [c] -> (as ++ [c], []) + (c:Space:cs) -> (as ++ [c], cs) + (c:SoftBreak:cs) -> (as ++ [c], cs) + (Str ".":Str (')':ys):cs) -> (as ++ [Str ".", Str (')':ys)], cs) + (x@(Str ('.':')':_)):cs) -> (as ++ [x], cs) + (LineBreak:x@(Str ('.':_)):cs) -> (as ++[LineBreak], x:cs) + (c:cs) -> (as ++ [c] ++ ds, es) + where (ds, es) = breakSentence cs + +-- | Split a list of inlines into sentences. +splitSentences :: [Inline] -> [[Inline]] +splitSentences xs = + let (sent, rest) = breakSentence xs + in if null rest then [sent] else sent : splitSentences rest + +-- | Convert Pandoc block element to man. +blockToMan :: PandocMonad m + => WriterOptions -- ^ Options + -> Block -- ^ Block element + -> StateT WriterState m Doc +blockToMan _ Null = return empty +blockToMan opts (Div _ bs) = blockListToMan opts bs +blockToMan opts (Plain inlines) = + liftM vcat $ mapM (inlineListToMan opts) $ splitSentences inlines +blockToMan opts (Para inlines) = do + contents <- liftM vcat $ mapM (inlineListToMan opts) $ + splitSentences inlines + return $ text ".PP" $$ contents +blockToMan opts (LineBlock lns) = + blockToMan opts $ linesToPara lns +blockToMan _ b@(RawBlock f str) + | f == Format "man" = return $ text str + | otherwise = do + report $ BlockNotRendered b + return empty +blockToMan _ HorizontalRule = return $ text ".PP" $$ text " * * * * *" +blockToMan opts (Header level _ inlines) = do + contents <- inlineListToMan opts inlines + let heading = case level of + 1 -> ".SH " + _ -> ".SS " + return $ text heading <> contents +blockToMan _ (CodeBlock _ str) = return $ + text ".IP" $$ + text ".nf" $$ + text "\\f[C]" $$ + text (escapeCode str) $$ + text "\\f[]" $$ + text ".fi" +blockToMan opts (BlockQuote blocks) = do + contents <- blockListToMan opts blocks + return $ text ".RS" $$ contents $$ text ".RE" +blockToMan opts (Table caption alignments widths headers rows) = + let aligncode AlignLeft = "l" + aligncode AlignRight = "r" + aligncode AlignCenter = "c" + aligncode AlignDefault = "l" + in do + caption' <- inlineListToMan opts caption + modify $ \st -> st{ stHasTables = True } + let iwidths = if all (== 0) widths + then repeat "" + else map (printf "w(%0.1fn)" . (70 *)) widths + -- 78n default width - 8n indent = 70n + let coldescriptions = text $ intercalate " " + (zipWith (\align width -> aligncode align ++ width) + alignments iwidths) ++ "." + colheadings <- mapM (blockListToMan opts) headers + let makeRow cols = text "T{" $$ + (vcat $ intersperse (text "T}@T{") cols) $$ + text "T}" + let colheadings' = if all null headers + then empty + else makeRow colheadings $$ char '_' + body <- mapM (\row -> do + cols <- mapM (blockListToMan opts) row + return $ makeRow cols) rows + return $ text ".PP" $$ caption' $$ + text ".TS" $$ text "tab(@);" $$ coldescriptions $$ + colheadings' $$ vcat body $$ text ".TE" + +blockToMan opts (BulletList items) = do + contents <- mapM (bulletListItemToMan opts) items + return (vcat contents) +blockToMan opts (OrderedList attribs items) = do + let markers = take (length items) $ orderedListMarkers attribs + let indent = 1 + (maximum $ map length markers) + contents <- mapM (\(num, item) -> orderedListItemToMan opts num indent item) $ + zip markers items + return (vcat contents) +blockToMan opts (DefinitionList items) = do + contents <- mapM (definitionListItemToMan opts) items + return (vcat contents) + +-- | Convert bullet list item (list of blocks) to man. +bulletListItemToMan :: PandocMonad m => WriterOptions -> [Block] -> StateT WriterState m Doc +bulletListItemToMan _ [] = return empty +bulletListItemToMan opts ((Para first):rest) = + bulletListItemToMan opts ((Plain first):rest) +bulletListItemToMan opts ((Plain first):rest) = do + first' <- blockToMan opts (Plain first) + rest' <- blockListToMan opts rest + let first'' = text ".IP \\[bu] 2" $$ first' + let rest'' = if null rest + then empty + else text ".RS 2" $$ rest' $$ text ".RE" + return (first'' $$ rest'') +bulletListItemToMan opts (first:rest) = do + first' <- blockToMan opts first + rest' <- blockListToMan opts rest + return $ text "\\[bu] .RS 2" $$ first' $$ rest' $$ text ".RE" + +-- | Convert ordered list item (a list of blocks) to man. +orderedListItemToMan :: PandocMonad m + => WriterOptions -- ^ options + -> String -- ^ order marker for list item + -> Int -- ^ number of spaces to indent + -> [Block] -- ^ list item (list of blocks) + -> StateT WriterState m Doc +orderedListItemToMan _ _ _ [] = return empty +orderedListItemToMan opts num indent ((Para first):rest) = + orderedListItemToMan opts num indent ((Plain first):rest) +orderedListItemToMan opts num indent (first:rest) = do + first' <- blockToMan opts first + rest' <- blockListToMan opts rest + let num' = printf ("%" ++ show (indent - 1) ++ "s") num + let first'' = text (".IP \"" ++ num' ++ "\" " ++ show indent) $$ first' + let rest'' = if null rest + then empty + else text ".RS 4" $$ rest' $$ text ".RE" + return $ first'' $$ rest'' + +-- | Convert definition list item (label, list of blocks) to man. +definitionListItemToMan :: PandocMonad m + => WriterOptions + -> ([Inline],[[Block]]) + -> StateT WriterState m Doc +definitionListItemToMan opts (label, defs) = do + labelText <- inlineListToMan opts label + contents <- if null defs + then return empty + else liftM vcat $ forM defs $ \blocks -> do + (first, rest) <- case blocks of + ((Para x):y) -> return (Plain x,y) + (x:y) -> return (x,y) + [] -> throwError $ PandocSomeError "blocks is null" + rest' <- liftM vcat $ + mapM (\item -> blockToMan opts item) rest + first' <- blockToMan opts first + return $ first' $$ text ".RS" $$ rest' $$ text ".RE" + return $ text ".TP" $$ nowrap (text ".B " <> labelText) $$ contents + +-- | Convert list of Pandoc block elements to man. +blockListToMan :: PandocMonad m + => WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> StateT WriterState m Doc +blockListToMan opts blocks = + mapM (blockToMan opts) blocks >>= (return . vcat) + +-- | Convert list of Pandoc inline elements to man. +inlineListToMan :: PandocMonad m => WriterOptions -> [Inline] -> StateT WriterState m Doc +inlineListToMan opts lst = mapM (inlineToMan opts) lst >>= (return . hcat) + +-- | Convert Pandoc inline element to man. +inlineToMan :: PandocMonad m => WriterOptions -> Inline -> StateT WriterState m Doc +inlineToMan opts (Span _ ils) = inlineListToMan opts ils +inlineToMan opts (Emph lst) = do + contents <- inlineListToMan opts lst + return $ text "\\f[I]" <> contents <> text "\\f[]" +inlineToMan opts (Strong lst) = do + contents <- inlineListToMan opts lst + return $ text "\\f[B]" <> contents <> text "\\f[]" +inlineToMan opts (Strikeout lst) = do + contents <- inlineListToMan opts lst + return $ text "[STRIKEOUT:" <> contents <> char ']' +inlineToMan opts (Superscript lst) = do + contents <- inlineListToMan opts lst + return $ char '^' <> contents <> char '^' +inlineToMan opts (Subscript lst) = do + contents <- inlineListToMan opts lst + return $ char '~' <> contents <> char '~' +inlineToMan opts (SmallCaps lst) = inlineListToMan opts lst -- not supported +inlineToMan opts (Quoted SingleQuote lst) = do + contents <- inlineListToMan opts lst + return $ char '`' <> contents <> char '\'' +inlineToMan opts (Quoted DoubleQuote lst) = do + contents <- inlineListToMan opts lst + return $ text "\\[lq]" <> contents <> text "\\[rq]" +inlineToMan opts (Cite _ lst) = + inlineListToMan opts lst +inlineToMan _ (Code _ str) = + return $ text $ "\\f[C]" ++ escapeCode str ++ "\\f[]" +inlineToMan _ (Str str@('.':_)) = + return $ afterBreak "\\&" <> text (escapeString str) +inlineToMan _ (Str str) = return $ text $ escapeString str +inlineToMan opts (Math InlineMath str) = + lift (texMathToInlines InlineMath str) >>= inlineListToMan opts +inlineToMan opts (Math DisplayMath str) = do + contents <- lift (texMathToInlines DisplayMath str) >>= inlineListToMan opts + return $ cr <> text ".RS" $$ contents $$ text ".RE" +inlineToMan _ il@(RawInline f str) + | f == Format "man" = return $ text str + | otherwise = do + report $ InlineNotRendered il + return empty +inlineToMan _ LineBreak = return $ + cr <> text ".PD 0" $$ text ".P" $$ text ".PD" <> cr +inlineToMan _ SoftBreak = return space +inlineToMan _ Space = return space +inlineToMan opts (Link _ txt (src, _)) = do + linktext <- inlineListToMan opts txt + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) + return $ case txt of + [Str s] + | escapeURI s == srcSuffix -> + char '<' <> text srcSuffix <> char '>' + _ -> linktext <> text " (" <> text src <> char ')' +inlineToMan opts (Image attr alternate (source, tit)) = do + let txt = if (null alternate) || (alternate == [Str ""]) || + (alternate == [Str source]) -- to prevent autolinks + then [Str "image"] + else alternate + linkPart <- inlineToMan opts (Link attr txt (source, tit)) + return $ char '[' <> text "IMAGE: " <> linkPart <> char ']' +inlineToMan _ (Note contents) = do + -- add to notes in state + modify $ \st -> st{ stNotes = contents : stNotes st } + notes <- liftM stNotes get + let ref = show $ (length notes) + return $ char '[' <> text ref <> char ']' diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs new file mode 100644 index 000000000..a97c32542 --- /dev/null +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -0,0 +1,1147 @@ +{-# LANGUAGE OverloadedStrings, TupleSections, ScopedTypeVariables, MultiWayIf #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Markdown + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to markdown-formatted plain text. + +Markdown: <http://daringfireball.net/projects/markdown/> +-} +module Text.Pandoc.Writers.Markdown (writeMarkdown, writePlain) where +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space) +import Data.Maybe (fromMaybe) +import Data.Monoid (Any(..)) +import Data.List ( group, stripPrefix, find, intersperse, transpose, sortBy ) +import Data.Char ( isSpace, isPunctuation, ord, chr ) +import Data.Ord ( comparing ) +import Text.Pandoc.Pretty +import Control.Monad.Reader +import Control.Monad.State +import Control.Monad.Except (throwError) +import Text.Pandoc.Writers.HTML (writeHtml5String) +import Text.Pandoc.Writers.Math (texMathToInlines) +import Text.HTML.TagSoup (parseTags, isTagText, Tag(..)) +import Network.URI (isURI) +import Data.Default +import Data.Yaml (Value(Object,String,Array,Bool,Number)) +import qualified Data.HashMap.Strict as H +import qualified Data.Vector as V +import qualified Data.Text as T +import qualified Data.Set as Set +import Network.HTTP ( urlEncode ) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +type Notes = [[Block]] +type Ref = ([Inline], Target, Attr) +type Refs = [Ref] + +type MD m = ReaderT WriterEnv (StateT WriterState m) + +evalMD :: PandocMonad m => MD m a -> WriterEnv -> WriterState -> m a +evalMD md env st = evalStateT (runReaderT md env) st + +data WriterEnv = WriterEnv { envInList :: Bool + , envPlain :: Bool + , envRefShortcutable :: Bool + , envBlockLevel :: Int + , envEscapeSpaces :: Bool + } + +instance Default WriterEnv + where def = WriterEnv { envInList = False + , envPlain = False + , envRefShortcutable = True + , envBlockLevel = 0 + , envEscapeSpaces = False + } + +data WriterState = WriterState { stNotes :: Notes + , stRefs :: Refs + , stIds :: Set.Set String + , stNoteNum :: Int + } + +instance Default WriterState + where def = WriterState{ stNotes = [] + , stRefs = [] + , stIds = Set.empty + , stNoteNum = 1 + } + +-- | Convert Pandoc to Markdown. +writeMarkdown :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeMarkdown opts document = + evalMD (pandocToMarkdown opts{ + writerWrapText = if isEnabled Ext_hard_line_breaks opts + then WrapNone + else writerWrapText opts } + document) def def + +-- | Convert Pandoc to plain text (like markdown, but without links, +-- pictures, or inline formatting). +writePlain :: PandocMonad m => WriterOptions -> Pandoc -> m String +writePlain opts document = + evalMD (pandocToMarkdown opts document) def{ envPlain = True } def + +pandocTitleBlock :: Doc -> [Doc] -> Doc -> Doc +pandocTitleBlock tit auths dat = + hang 2 (text "% ") tit <> cr <> + hang 2 (text "% ") (vcat $ map nowrap auths) <> cr <> + hang 2 (text "% ") dat <> cr + +mmdTitleBlock :: Value -> Doc +mmdTitleBlock (Object hashmap) = + vcat $ map go $ sortBy (comparing fst) $ H.toList hashmap + where go (k,v) = + case (text (T.unpack k), v) of + (k', Array vec) + | V.null vec -> empty + | otherwise -> k' <> ":" <> space <> + hcat (intersperse "; " + (map fromstr $ V.toList vec)) + (_, String "") -> empty + (k', x) -> k' <> ":" <> space <> nest 2 (fromstr x) + fromstr (String s) = text (removeBlankLines $ T.unpack s) + fromstr (Bool b) = text (show b) + fromstr (Number n) = text (show n) + fromstr _ = empty + -- blank lines not allowed in MMD metadata - we replace with . + removeBlankLines = trimr . unlines . map (\x -> + if all isSpace x then "." else x) . lines +mmdTitleBlock _ = empty + +plainTitleBlock :: Doc -> [Doc] -> Doc -> Doc +plainTitleBlock tit auths dat = + tit <> cr <> + (hcat (intersperse (text "; ") auths)) <> cr <> + dat <> cr + +yamlMetadataBlock :: Value -> Doc +yamlMetadataBlock v = "---" $$ (jsonToYaml v) $$ "---" + +jsonToYaml :: Value -> Doc +jsonToYaml (Object hashmap) = + vcat $ map (\(k,v) -> + case (text (T.unpack k), v, jsonToYaml v) of + (k', Array vec, x) + | V.null vec -> empty + | otherwise -> (k' <> ":") $$ x + (k', Object _, x) -> (k' <> ":") $$ nest 2 x + (_, String "", _) -> empty + (k', _, x) | k == "meta-json" -> empty + | otherwise -> k' <> ":" <> space <> hang 2 "" x) + $ sortBy (comparing fst) $ H.toList hashmap +jsonToYaml (Array vec) = + vcat $ map (\v -> hang 2 "- " (jsonToYaml v)) $ V.toList vec +jsonToYaml (String "") = empty +jsonToYaml (String s) = + case T.unpack s of + x | '\n' `elem` x -> hang 2 ("|" <> cr) $ text x + | not (any isPunctuation x) -> text x + | otherwise -> text $ "'" ++ substitute "'" "''" x ++ "'" +jsonToYaml (Bool b) = text $ show b +jsonToYaml (Number n) = text $ show n +jsonToYaml _ = empty + +-- | Return markdown representation of document. +pandocToMarkdown :: PandocMonad m => WriterOptions -> Pandoc -> MD m String +pandocToMarkdown opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + isPlain <- asks envPlain + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToMarkdown opts) + (fmap (render colwidth) . inlineListToMarkdown opts) + meta + let title' = maybe empty text $ getField "title" metadata + let authors' = maybe [] (map text) $ getField "author" metadata + let date' = maybe empty text $ getField "date" metadata + let titleblock = case writerTemplate opts of + Just _ | isPlain -> + plainTitleBlock title' authors' date' + | isEnabled Ext_yaml_metadata_block opts -> + yamlMetadataBlock metadata + | isEnabled Ext_pandoc_title_block opts -> + pandocTitleBlock title' authors' date' + | isEnabled Ext_mmd_title_block opts -> + mmdTitleBlock metadata + | otherwise -> empty + Nothing -> empty + let headerBlocks = filter isHeaderBlock blocks + toc <- if writerTableOfContents opts + then tableOfContents opts headerBlocks + else return empty + -- Strip off final 'references' header if markdown citations enabled + let blocks' = if isEnabled Ext_citations opts + then case reverse blocks of + (Div (_,["references"],_) _):xs -> reverse xs + _ -> blocks + else blocks + body <- blockListToMarkdown opts blocks' + notesAndRefs' <- notesAndRefs opts + let render' :: Doc -> String + render' = render colwidth + let main = render' $ body <> notesAndRefs' + let context = defField "toc" (render' toc) + $ defField "body" main + $ (if isNullMeta meta + then id + else defField "titleblock" (render' titleblock)) + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Return markdown representation of reference key table. +refsToMarkdown :: PandocMonad m => WriterOptions -> Refs -> MD m Doc +refsToMarkdown opts refs = mapM (keyToMarkdown opts) refs >>= return . vcat + +-- | Return markdown representation of a reference key. +keyToMarkdown :: PandocMonad m + => WriterOptions + -> Ref + -> MD m Doc +keyToMarkdown opts (label, (src, tit), attr) = do + label' <- inlineListToMarkdown opts label + let tit' = if null tit + then empty + else space <> "\"" <> text tit <> "\"" + return $ nest 2 $ hang 2 + ("[" <> label' <> "]:" <> space) (text src <> tit') + <> linkAttributes opts attr + +-- | Return markdown representation of notes. +notesToMarkdown :: PandocMonad m => WriterOptions -> [[Block]] -> MD m Doc +notesToMarkdown opts notes = do + n <- gets stNoteNum + notes' <- mapM (\(num, note) -> noteToMarkdown opts num note) (zip [n..] notes) + modify $ \st -> st { stNoteNum = stNoteNum st + length notes } + return $ vsep notes' + +-- | Return markdown representation of a note. +noteToMarkdown :: PandocMonad m => WriterOptions -> Int -> [Block] -> MD m Doc +noteToMarkdown opts num blocks = do + contents <- blockListToMarkdown opts blocks + let num' = text $ writerIdentifierPrefix opts ++ show num + let marker = if isEnabled Ext_footnotes opts + then text "[^" <> num' <> text "]:" + else text "[" <> num' <> text "]" + let markerSize = 4 + offset num' + let spacer = case writerTabStop opts - markerSize of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + return $ if isEnabled Ext_footnotes opts + then hang (writerTabStop opts) (marker <> spacer) contents + else marker <> spacer <> contents + +-- | Escape special characters for Markdown. +escapeString :: WriterOptions -> String -> String +escapeString _ [] = [] +escapeString opts (c:cs) = + case c of + '<' -> "<" ++ escapeString opts cs + '>' -> ">" ++ escapeString opts cs + _ | c `elem` ['\\','`','*','_','[',']','#'] -> + '\\':c:escapeString opts cs + '^' | isEnabled Ext_superscript opts -> '\\':'^':escapeString opts cs + '~' | isEnabled Ext_subscript opts -> '\\':'~':escapeString opts cs + '$' | isEnabled Ext_tex_math_dollars opts -> '\\':'$':escapeString opts cs + '\'' | isEnabled Ext_smart opts -> '\\':'\'':escapeString opts cs + '"' | isEnabled Ext_smart opts -> '\\':'"':escapeString opts cs + '-' | isEnabled Ext_smart opts -> + case cs of + '-':_ -> '\\':'-':escapeString opts cs + _ -> '-':escapeString opts cs + '.' | isEnabled Ext_smart opts -> + case cs of + '.':'.':rest -> '\\':'.':'.':'.':escapeString opts rest + _ -> '.':escapeString opts cs + _ -> c : escapeString opts cs + +-- | Construct table of contents from list of header blocks. +tableOfContents :: PandocMonad m => WriterOptions -> [Block] -> m Doc +tableOfContents opts headers = + let contents = BulletList $ map (elementToListItem opts) $ hierarchicalize headers + in evalMD (blockToMarkdown opts contents) def def + +-- | Converts an Element to a list item for a table of contents, +elementToListItem :: WriterOptions -> Element -> [Block] +elementToListItem opts (Sec lev _nums (ident,_,_) headerText subsecs) + = Plain headerLink : + [ BulletList (map (elementToListItem opts) subsecs) | + not (null subsecs) && lev < writerTOCDepth opts ] + where headerLink = if null ident + then walk deNote headerText + else [Link nullAttr (walk deNote headerText) + ('#':ident, "")] +elementToListItem _ (Blk _) = [] + +attrsToMarkdown :: Attr -> Doc +attrsToMarkdown attribs = braces $ hsep [attribId, attribClasses, attribKeys] + where attribId = case attribs of + ([],_,_) -> empty + (i,_,_) -> "#" <> text i + attribClasses = case attribs of + (_,[],_) -> empty + (_,cs,_) -> hsep $ + map (text . ('.':)) + cs + attribKeys = case attribs of + (_,_,[]) -> empty + (_,_,ks) -> hsep $ + map (\(k,v) -> text k + <> "=\"" <> text v <> "\"") ks + +linkAttributes :: WriterOptions -> Attr -> Doc +linkAttributes opts attr = + if isEnabled Ext_link_attributes opts && attr /= nullAttr + then attrsToMarkdown attr + else empty + +-- | Ordered list start parser for use in Para below. +olMarker :: Parser [Char] ParserState Char +olMarker = do (start, style', delim) <- anyOrderedListMarker + if delim == Period && + (style' == UpperAlpha || (style' == UpperRoman && + start `elem` [1, 5, 10, 50, 100, 500, 1000])) + then spaceChar >> spaceChar + else spaceChar + +-- | True if string begins with an ordered list marker +beginsWithOrderedListMarker :: String -> Bool +beginsWithOrderedListMarker str = + case runParser olMarker defaultParserState "para start" (take 10 str) of + Left _ -> False + Right _ -> True + +notesAndRefs :: PandocMonad m => WriterOptions -> MD m Doc +notesAndRefs opts = do + notes' <- reverse <$> gets stNotes >>= notesToMarkdown opts + modify $ \s -> s { stNotes = [] } + refs' <- reverse <$> gets stRefs >>= refsToMarkdown opts + modify $ \s -> s { stRefs = [] } + + let endSpacing = + if | writerReferenceLocation opts == EndOfDocument -> empty + | isEmpty notes' && isEmpty refs' -> empty + | otherwise -> blankline + + return $ + (if isEmpty notes' then empty else blankline <> notes') <> + (if isEmpty refs' then empty else blankline <> refs') <> + endSpacing + +-- | Convert Pandoc block element to markdown. +blockToMarkdown :: PandocMonad m + => WriterOptions -- ^ Options + -> Block -- ^ Block element + -> MD m Doc +blockToMarkdown opts blk = + local (\env -> env {envBlockLevel = envBlockLevel env + 1}) $ + do doc <- blockToMarkdown' opts blk + blkLevel <- asks envBlockLevel + if writerReferenceLocation opts == EndOfBlock && blkLevel == 1 + then notesAndRefs opts >>= (\d -> return $ doc <> d) + else return doc + +blockToMarkdown' :: PandocMonad m + => WriterOptions -- ^ Options + -> Block -- ^ Block element + -> MD m Doc +blockToMarkdown' _ Null = return empty +blockToMarkdown' opts (Div attrs ils) = do + contents <- blockListToMarkdown opts ils + return $ if isEnabled Ext_raw_html opts && + isEnabled Ext_markdown_in_html_blocks opts + then tagWithAttrs "div" attrs <> blankline <> + contents <> blankline <> "</div>" <> blankline + else contents <> blankline +blockToMarkdown' opts (Plain inlines) = do + contents <- inlineListToMarkdown opts inlines + -- escape if para starts with ordered list marker + isPlain <- asks envPlain + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + let rendered = render colwidth contents + let escapeDelimiter (x:xs) | x `elem` (".()" :: String) = '\\':x:xs + | otherwise = x : escapeDelimiter xs + escapeDelimiter [] = [] + let contents' = if isEnabled Ext_all_symbols_escapable opts && + not isPlain && beginsWithOrderedListMarker rendered + then text $ escapeDelimiter rendered + else contents + return $ contents' <> cr +-- title beginning with fig: indicates figure +blockToMarkdown' opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = + blockToMarkdown opts (Para [Image attr alt (src,tit)]) +blockToMarkdown' opts (Para inlines) = + (<> blankline) `fmap` blockToMarkdown opts (Plain inlines) +blockToMarkdown' opts (LineBlock lns) = + if isEnabled Ext_line_blocks opts + then do + mdLines <- mapM (inlineListToMarkdown opts) lns + return $ (vcat $ map (hang 2 (text "| ")) mdLines) <> blankline + else blockToMarkdown opts $ linesToPara lns +blockToMarkdown' opts b@(RawBlock f str) + | f == "markdown" = return $ text str <> text "\n" + | f == "html" && isEnabled Ext_raw_html opts = do + plain <- asks envPlain + return $ if plain + then empty + else if isEnabled Ext_markdown_attribute opts + then text (addMarkdownAttribute str) <> text "\n" + else text str <> text "\n" + | f `elem` ["latex", "tex"] && isEnabled Ext_raw_tex opts = do + plain <- asks envPlain + return $ if plain + then empty + else text str <> text "\n" + | otherwise = do + report $ BlockNotRendered b + return empty +blockToMarkdown' opts HorizontalRule = do + return $ blankline <> text (replicate (writerColumns opts) '-') <> blankline +blockToMarkdown' opts (Header level attr inlines) = do + -- first, if we're putting references at the end of a section, we + -- put them here. + blkLevel <- asks envBlockLevel + refs <- if writerReferenceLocation opts == EndOfSection && blkLevel == 1 + then notesAndRefs opts + else return empty + + plain <- asks envPlain + -- we calculate the id that would be used by auto_identifiers + -- so we know whether to print an explicit identifier + ids <- gets stIds + let autoId = uniqueIdent inlines ids + modify $ \st -> st{ stIds = Set.insert autoId ids } + let attr' = case attr of + ("",[],[]) -> empty + (id',[],[]) | isEnabled Ext_auto_identifiers opts + && id' == autoId -> empty + (id',_,_) | isEnabled Ext_mmd_header_identifiers opts -> + space <> brackets (text id') + _ | isEnabled Ext_header_attributes opts -> + space <> attrsToMarkdown attr + | otherwise -> empty + contents <- inlineListToMarkdown opts $ + if level == 1 && plain + then capitalize inlines + else inlines + let setext = writerSetextHeaders opts + hdr = nowrap $ case level of + 1 | plain -> blanklines 3 <> contents <> blanklines 2 + | setext -> + contents <> attr' <> cr <> text (replicate (offset contents) '=') <> + blankline + 2 | plain -> blanklines 2 <> contents <> blankline + | setext -> + contents <> attr' <> cr <> text (replicate (offset contents) '-') <> + blankline + -- ghc interprets '#' characters in column 1 as linenum specifiers. + _ | plain || isEnabled Ext_literate_haskell opts -> + contents <> blankline + _ -> text (replicate level '#') <> space <> contents <> attr' <> blankline + + return $ refs <> hdr +blockToMarkdown' opts (CodeBlock (_,classes,_) str) + | "haskell" `elem` classes && "literate" `elem` classes && + isEnabled Ext_literate_haskell opts = + return $ prefixed "> " (text str) <> blankline +blockToMarkdown' opts (CodeBlock attribs str) = return $ + case attribs == nullAttr of + False | isEnabled Ext_backtick_code_blocks opts -> + backticks <> attrs <> cr <> text str <> cr <> backticks <> blankline + | isEnabled Ext_fenced_code_blocks opts -> + tildes <> attrs <> cr <> text str <> cr <> tildes <> blankline + _ -> nest (writerTabStop opts) (text str) <> blankline + where tildes = text $ case [ln | ln <- lines str, all (=='~') ln] of + [] -> "~~~~" + xs -> case maximum $ map length xs of + n | n < 3 -> "~~~~" + | otherwise -> replicate (n+1) '~' + backticks = text $ case [ln | ln <- lines str, all (=='`') ln] of + [] -> "```" + xs -> case maximum $ map length xs of + n | n < 3 -> "```" + | otherwise -> replicate (n+1) '`' + attrs = if isEnabled Ext_fenced_code_attributes opts + then nowrap $ " " <> attrsToMarkdown attribs + else case attribs of + (_,(cls:_),_) -> " " <> text cls + _ -> empty +blockToMarkdown' opts (BlockQuote blocks) = do + plain <- asks envPlain + -- if we're writing literate haskell, put a space before the bird tracks + -- so they won't be interpreted as lhs... + let leader = if isEnabled Ext_literate_haskell opts + then " > " + else if plain then " " else "> " + contents <- blockListToMarkdown opts blocks + return $ (prefixed leader contents) <> blankline +blockToMarkdown' opts t@(Table caption aligns widths headers rows) = do + caption' <- inlineListToMarkdown opts caption + let caption'' = if null caption || not (isEnabled Ext_table_captions opts) + then empty + else blankline <> ": " <> caption' <> blankline + rawHeaders <- mapM (blockListToMarkdown opts) headers + rawRows <- mapM (mapM (blockListToMarkdown opts)) rows + let isLineBreak LineBreak = Any True + isLineBreak _ = Any False + let isSimple = all (==0) widths && + not ( getAny (query isLineBreak (headers:rows)) ) + let isPlainBlock (Plain _) = True + isPlainBlock _ = False + let hasBlocks = not (all isPlainBlock $ concat . concat $ headers:rows) + (nst,tbl) <- case True of + _ | isSimple && + isEnabled Ext_simple_tables opts -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | isSimple && + isEnabled Ext_pipe_tables opts -> fmap (id,) $ + pipeTable (all null headers) aligns rawHeaders rawRows + | not hasBlocks && + isEnabled Ext_multiline_tables opts -> fmap (nest 2,) $ + pandocTable opts (all null headers) aligns widths + rawHeaders rawRows + | isEnabled Ext_grid_tables opts -> fmap (id,) $ + gridTable opts (all null headers) aligns widths + rawHeaders rawRows + | isEnabled Ext_raw_html opts -> fmap (id,) $ + text <$> + (writeHtml5String def $ Pandoc nullMeta [t]) + | otherwise -> return $ (id, text "[TABLE]") + return $ nst $ tbl $$ blankline $$ caption'' $$ blankline +blockToMarkdown' opts (BulletList items) = do + contents <- inList $ mapM (bulletListItemToMarkdown opts) items + return $ cat contents <> blankline +blockToMarkdown' opts (OrderedList (start,sty,delim) items) = do + let start' = if isEnabled Ext_startnum opts then start else 1 + let sty' = if isEnabled Ext_fancy_lists opts then sty else DefaultStyle + let delim' = if isEnabled Ext_fancy_lists opts then delim else DefaultDelim + let attribs = (start', sty', delim') + let markers = orderedListMarkers attribs + let markers' = map (\m -> if length m < 3 + then m ++ replicate (3 - length m) ' ' + else m) markers + contents <- inList $ + mapM (\(item, num) -> orderedListItemToMarkdown opts item num) $ + zip markers' items + return $ cat contents <> blankline +blockToMarkdown' opts (DefinitionList items) = do + contents <- inList $ mapM (definitionListItemToMarkdown opts) items + return $ cat contents <> blankline + +inList :: Monad m => MD m a -> MD m a +inList p = local (\env -> env {envInList = True}) p + +addMarkdownAttribute :: String -> String +addMarkdownAttribute s = + case span isTagText $ reverse $ parseTags s of + (xs,(TagOpen t attrs:rest)) -> + renderTags' $ reverse rest ++ (TagOpen t attrs' : reverse xs) + where attrs' = ("markdown","1"):[(x,y) | (x,y) <- attrs, + x /= "markdown"] + _ -> s + +pipeTable :: PandocMonad m => Bool -> [Alignment] -> [Doc] -> [[Doc]] -> MD m Doc +pipeTable headless aligns rawHeaders rawRows = do + let sp = text " " + let blockFor AlignLeft x y = lblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor AlignCenter x y = cblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor AlignRight x y = rblock (x + 2) (sp <> y) <> lblock 0 empty + blockFor _ x y = lblock (x + 2) (sp <> y) <> lblock 0 empty + let widths = map (max 3 . maximum . map offset) $ transpose (rawHeaders : rawRows) + let torow cs = nowrap $ text "|" <> + hcat (intersperse (text "|") $ + zipWith3 blockFor aligns widths (map chomp cs)) + <> text "|" + let toborder (a, w) = text $ case a of + AlignLeft -> ':':replicate (w + 1) '-' + AlignCenter -> ':':replicate w '-' ++ ":" + AlignRight -> replicate (w + 1) '-' ++ ":" + AlignDefault -> replicate (w + 2) '-' + -- note: pipe tables can't completely lack a + -- header; for a headerless table, we need a header of empty cells. + -- see jgm/pandoc#1996. + let header = if headless + then torow (replicate (length aligns) empty) + else torow rawHeaders + let border = nowrap $ text "|" <> hcat (intersperse (text "|") $ + map toborder $ zip aligns widths) <> text "|" + let body = vcat $ map torow rawRows + return $ header $$ border $$ body + +pandocTable :: PandocMonad m => WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> MD m Doc +pandocTable opts headless aligns widths rawHeaders rawRows = do + let isSimple = all (==0) widths + let alignHeader alignment = case alignment of + AlignLeft -> lblock + AlignCenter -> cblock + AlignRight -> rblock + AlignDefault -> lblock + -- Number of characters per column necessary to output every cell + -- without requiring a line break. + -- The @+2@ is needed for specifying the alignment. + let numChars = (+ 2) . maximum . map offset + -- Number of characters per column necessary to output every cell + -- without requiring a line break *inside a word*. + -- The @+2@ is needed for specifying the alignment. + let minNumChars = (+ 2) . maximum . map minOffset + let columns = transpose (rawHeaders : rawRows) + -- minimal column width without wrapping a single word + let noWordWrapWidth + | writerWrapText opts == WrapAuto + = fromIntegral $ maximum (map minNumChars columns) + | otherwise = fromIntegral $ maximum (map numChars columns) + let relWidth w = floor $ max (fromIntegral (writerColumns opts) * w) + (noWordWrapWidth * w / minimum widths) + let widthsInChars + | isSimple = map numChars columns + | otherwise = map relWidth widths + let makeRow = hcat . intersperse (lblock 1 (text " ")) . + (zipWith3 alignHeader aligns widthsInChars) + let rows' = map makeRow rawRows + let head' = makeRow rawHeaders + let maxRowHeight = maximum $ map height (head':rows') + let underline = cat $ intersperse (text " ") $ + map (\width -> text (replicate width '-')) widthsInChars + let border = if maxRowHeight > 1 + then text (replicate (sum widthsInChars + + length widthsInChars - 1) '-') + else if headless + then underline + else empty + let head'' = if headless + then empty + else border <> cr <> head' + let body = if maxRowHeight > 1 + then vsep rows' + else vcat rows' + let bottom = if headless + then underline + else border + return $ head'' $$ underline $$ body $$ bottom + +gridTable :: PandocMonad m => WriterOptions -> Bool -> [Alignment] -> [Double] + -> [Doc] -> [[Doc]] -> MD m Doc +gridTable opts headless aligns widths headers' rawRows = do + let numcols = length headers' + let widths' = if all (==0) widths + then replicate numcols (1.0 / fromIntegral numcols) + else widths + let widthsInChars = map + ((\x -> x - 3) . floor . (fromIntegral (writerColumns opts) *)) widths' + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = maximum (1 : map height blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = chomp $ hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + let rows' = map (makeRow . map chomp) rawRows + let borderpart ch align widthInChars = + let widthInChars' = if widthInChars < 1 then 1 else widthInChars + in (if (align == AlignLeft || align == AlignCenter) + then char ':' + else char ch) <> + text (replicate widthInChars' ch) <> + (if (align == AlignRight || align == AlignCenter) + then char ':' + else char ch) + let border ch aligns' widthsInChars' = + char '+' <> + hcat (intersperse (char '+') (zipWith (borderpart ch) + aligns' widthsInChars')) <> char '+' + let body = vcat $ intersperse (border '-' (repeat AlignDefault) widthsInChars) + rows' + let head'' = if headless + then empty + else head' $$ border '=' aligns widthsInChars + if headless + then return $ + border '-' aligns widthsInChars $$ + body $$ + border '-' (repeat AlignDefault) widthsInChars + else return $ + border '-' (repeat AlignDefault) widthsInChars $$ + head'' $$ + body $$ + border '-' (repeat AlignDefault) widthsInChars + +itemEndsWithTightList :: [Block] -> Bool +itemEndsWithTightList bs = + case bs of + [Plain _, BulletList xs] -> isTightList xs + [Plain _, OrderedList _ xs] -> isTightList xs + _ -> False + +-- | Convert bullet list item (list of blocks) to markdown. +bulletListItemToMarkdown :: PandocMonad m => WriterOptions -> [Block] -> MD m Doc +bulletListItemToMarkdown opts bs = do + contents <- blockListToMarkdown opts bs + let sps = replicate (writerTabStop opts - 2) ' ' + let start = text ('-' : ' ' : sps) + -- remove trailing blank line if item ends with a tight list + let contents' = if itemEndsWithTightList bs + then chomp contents <> cr + else contents + return $ hang (writerTabStop opts) start $ contents' <> cr + +-- | Convert ordered list item (a list of blocks) to markdown. +orderedListItemToMarkdown :: PandocMonad m + => WriterOptions -- ^ options + -> String -- ^ list item marker + -> [Block] -- ^ list item (list of blocks) + -> MD m Doc +orderedListItemToMarkdown opts marker bs = do + contents <- blockListToMarkdown opts bs + let sps = case length marker - writerTabStop opts of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + let start = text marker <> sps + -- remove trailing blank line if item ends with a tight list + let contents' = if itemEndsWithTightList bs + then chomp contents <> cr + else contents + return $ hang (writerTabStop opts) start $ contents' <> cr + +-- | Convert definition list item (label, list of blocks) to markdown. +definitionListItemToMarkdown :: PandocMonad m + => WriterOptions + -> ([Inline],[[Block]]) + -> MD m Doc +definitionListItemToMarkdown opts (label, defs) = do + labelText <- inlineListToMarkdown opts label + defs' <- mapM (mapM (blockToMarkdown opts)) defs + if isEnabled Ext_definition_lists opts + then do + let tabStop = writerTabStop opts + isPlain <- asks envPlain + let leader = if isPlain then " " else ": " + let sps = case writerTabStop opts - 3 of + n | n > 0 -> text $ replicate n ' ' + _ -> text " " + if isEnabled Ext_compact_definition_lists opts + then do + let contents = vcat $ map (\d -> hang tabStop (leader <> sps) + $ vcat d <> cr) defs' + return $ nowrap labelText <> cr <> contents <> cr + else do + let contents = vcat $ map (\d -> hang tabStop (leader <> sps) + $ vcat d <> cr) defs' + let isTight = case defs of + ((Plain _ : _): _) -> True + _ -> False + return $ blankline <> nowrap labelText <> + (if isTight then cr else blankline) <> contents <> blankline + else do + return $ nowrap labelText <> text " " <> cr <> + vsep (map vsep defs') <> blankline + +-- | Convert list of Pandoc block elements to markdown. +blockListToMarkdown :: PandocMonad m + => WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> MD m Doc +blockListToMarkdown opts blocks = + mapM (blockToMarkdown opts) (fixBlocks blocks) >>= return . cat + -- insert comment between list and indented code block, or the + -- code block will be treated as a list continuation paragraph + where fixBlocks (b : CodeBlock attr x : rest) + | (not (isEnabled Ext_fenced_code_blocks opts) || attr == nullAttr) + && isListBlock b = b : commentSep : CodeBlock attr x : + fixBlocks rest + fixBlocks (b1@(BulletList _) : b2@(BulletList _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) + fixBlocks (b1@(OrderedList _ _) : b2@(OrderedList _ _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) + fixBlocks (b1@(DefinitionList _) : b2@(DefinitionList _) : bs) = + b1 : commentSep : fixBlocks (b2:bs) + fixBlocks (x : xs) = x : fixBlocks xs + fixBlocks [] = [] + isListBlock (BulletList _) = True + isListBlock (OrderedList _ _) = True + isListBlock (DefinitionList _) = True + isListBlock _ = False + commentSep = if isEnabled Ext_raw_html opts + then RawBlock "html" "<!-- -->\n" + else RawBlock "markdown" " " + +-- | Get reference for target; if none exists, create unique one and return. +-- Prefer label if possible; otherwise, generate a unique key. +getReference :: PandocMonad m => Attr -> [Inline] -> Target -> MD m [Inline] +getReference attr label target = do + st <- get + case find (\(_,t,a) -> t == target && a == attr) (stRefs st) of + Just (ref, _, _) -> return ref + Nothing -> do + label' <- case find (\(l,_,_) -> l == label) (stRefs st) of + Just _ -> -- label is used; generate numerical label + case find (\n -> notElem [Str (show n)] + (map (\(l,_,_) -> l) (stRefs st))) + [1..(10000 :: Integer)] of + Just x -> return [Str (show x)] + Nothing -> throwError $ PandocSomeError "no unique label" + Nothing -> return label + modify (\s -> s{ stRefs = (label', target, attr) : stRefs st }) + return label' + +-- | Convert list of Pandoc inline elements to markdown. +inlineListToMarkdown :: PandocMonad m => WriterOptions -> [Inline] -> MD m Doc +inlineListToMarkdown opts lst = do + inlist <- asks envInList + go (if inlist then avoidBadWrapsInList lst else lst) + where go [] = return empty + go (i:is) = case i of + (Link _ _ _) -> case is of + -- If a link is followed by another link or '[' we don't shortcut + (Link _ _ _):_ -> unshortcutable + Space:(Link _ _ _):_ -> unshortcutable + Space:(Str('[':_)):_ -> unshortcutable + Space:(RawInline _ ('[':_)):_ -> unshortcutable + Space:(Cite _ _):_ -> unshortcutable + SoftBreak:(Link _ _ _):_ -> unshortcutable + SoftBreak:(Str('[':_)):_ -> unshortcutable + SoftBreak:(RawInline _ ('[':_)):_ -> unshortcutable + SoftBreak:(Cite _ _):_ -> unshortcutable + (Cite _ _):_ -> unshortcutable + Str ('[':_):_ -> unshortcutable + (RawInline _ ('[':_)):_ -> unshortcutable + (RawInline _ (' ':'[':_)):_ -> unshortcutable + _ -> shortcutable + _ -> shortcutable + where shortcutable = liftM2 (<>) (inlineToMarkdown opts i) (go is) + unshortcutable = do + iMark <- local + (\env -> env { envRefShortcutable = False }) + (inlineToMarkdown opts i) + fmap (iMark <>) (go is) + +isSp :: Inline -> Bool +isSp Space = True +isSp SoftBreak = True +isSp _ = False + +avoidBadWrapsInList :: [Inline] -> [Inline] +avoidBadWrapsInList [] = [] +avoidBadWrapsInList (s:Str ('>':cs):xs) | isSp s = + Str (' ':'>':cs) : avoidBadWrapsInList xs +avoidBadWrapsInList (s:Str [c]:[]) + | isSp s && c `elem` ['-','*','+'] = Str [' ', c] : [] +avoidBadWrapsInList (s:Str [c]:Space:xs) + | isSp s && c `elem` ['-','*','+'] = + Str [' ', c] : Space : avoidBadWrapsInList xs +avoidBadWrapsInList (s:Str cs:Space:xs) + | isSp s && isOrderedListMarker cs = + Str (' ':cs) : Space : avoidBadWrapsInList xs +avoidBadWrapsInList (s:Str cs:[]) + | isSp s && isOrderedListMarker cs = Str (' ':cs) : [] +avoidBadWrapsInList (x:xs) = x : avoidBadWrapsInList xs + +isOrderedListMarker :: String -> Bool +isOrderedListMarker xs = (last xs `elem` ['.',')']) && + isRight (runParser (anyOrderedListMarker >> eof) + defaultParserState "" xs) + +isRight :: Either a b -> Bool +isRight (Right _) = True +isRight (Left _) = False + +-- | Convert Pandoc inline element to markdown. +inlineToMarkdown :: PandocMonad m => WriterOptions -> Inline -> MD m Doc +inlineToMarkdown opts (Span attrs ils) = do + plain <- asks envPlain + contents <- inlineListToMarkdown opts ils + return $ case plain of + True -> contents + False | isEnabled Ext_bracketed_spans opts -> + "[" <> contents <> "]" <> + if attrs == nullAttr + then "{}" + else linkAttributes opts attrs + | isEnabled Ext_raw_html opts || + isEnabled Ext_native_spans opts -> + tagWithAttrs "span" attrs <> contents <> text "</span>" + | otherwise -> contents +inlineToMarkdown opts (Emph lst) = do + plain <- asks envPlain + contents <- inlineListToMarkdown opts lst + return $ if plain + then "_" <> contents <> "_" + else "*" <> contents <> "*" +inlineToMarkdown opts (Strong lst) = do + plain <- asks envPlain + if plain + then inlineListToMarkdown opts $ capitalize lst + else do + contents <- inlineListToMarkdown opts lst + return $ "**" <> contents <> "**" +inlineToMarkdown opts (Strikeout lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_strikeout opts + then "~~" <> contents <> "~~" + else if isEnabled Ext_raw_html opts + then "<s>" <> contents <> "</s>" + else contents +inlineToMarkdown opts (Superscript lst) = + local (\env -> env {envEscapeSpaces = True}) $ do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_superscript opts + then "^" <> contents <> "^" + else if isEnabled Ext_raw_html opts + then "<sup>" <> contents <> "</sup>" + else case (render Nothing contents) of + ds | all (\d -> d >= '0' && d <= '9') ds + -> text (map toSuperscript ds) + _ -> contents + where toSuperscript '1' = '\x00B9' + toSuperscript '2' = '\x00B2' + toSuperscript '3' = '\x00B3' + toSuperscript c = chr (0x2070 + (ord c - 48)) +inlineToMarkdown opts (Subscript lst) = + local (\env -> env {envEscapeSpaces = True}) $ do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_subscript opts + then "~" <> contents <> "~" + else if isEnabled Ext_raw_html opts + then "<sub>" <> contents <> "</sub>" + else case (render Nothing contents) of + ds | all (\d -> d >= '0' && d <= '9') ds + -> text (map toSubscript ds) + _ -> contents + where toSubscript c = chr (0x2080 + (ord c - 48)) +inlineToMarkdown opts (SmallCaps lst) = do + plain <- asks envPlain + if not plain && + (isEnabled Ext_raw_html opts || isEnabled Ext_native_spans opts) + then do + contents <- inlineListToMarkdown opts lst + return $ tagWithAttrs "span" + ("",[],[("style","font-variant:small-caps;")]) + <> contents <> text "</span>" + else inlineListToMarkdown opts $ capitalize lst +inlineToMarkdown opts (Quoted SingleQuote lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_smart opts + then "'" <> contents <> "'" + else "‘" <> contents <> "’" +inlineToMarkdown opts (Quoted DoubleQuote lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_smart opts + then "\"" <> contents <> "\"" + else "“" <> contents <> "”" +inlineToMarkdown opts (Code attr str) = do + let tickGroups = filter (\s -> '`' `elem` s) $ group str + let longest = if null tickGroups + then 0 + else maximum $ map length tickGroups + let marker = replicate (longest + 1) '`' + let spacer = if (longest == 0) then "" else " " + let attrs = if isEnabled Ext_inline_code_attributes opts && attr /= nullAttr + then attrsToMarkdown attr + else empty + plain <- asks envPlain + if plain + then return $ text str + else return $ text (marker ++ spacer ++ str ++ spacer ++ marker) <> attrs +inlineToMarkdown opts (Str str) = do + isPlain <- asks envPlain + let str' = (if isEnabled Ext_smart opts + then unsmartify opts + else id) $ + if isPlain + then str + else escapeString opts str + return $ text str' +inlineToMarkdown opts (Math InlineMath str) = + case writerHTMLMathMethod opts of + WebTeX url -> + inlineToMarkdown opts (Image nullAttr [Str str] + (url ++ urlEncode str, str)) + _ | isEnabled Ext_tex_math_dollars opts -> + return $ "$" <> text str <> "$" + | isEnabled Ext_tex_math_single_backslash opts -> + return $ "\\(" <> text str <> "\\)" + | isEnabled Ext_tex_math_double_backslash opts -> + return $ "\\\\(" <> text str <> "\\\\)" + | otherwise -> do + plain <- asks envPlain + texMathToInlines InlineMath str >>= + inlineListToMarkdown opts . + (if plain then makeMathPlainer else id) +inlineToMarkdown opts (Math DisplayMath str) = + case writerHTMLMathMethod opts of + WebTeX url -> (\x -> blankline <> x <> blankline) `fmap` + inlineToMarkdown opts (Image nullAttr [Str str] + (url ++ urlEncode str, str)) + _ | isEnabled Ext_tex_math_dollars opts -> + return $ "$$" <> text str <> "$$" + | isEnabled Ext_tex_math_single_backslash opts -> + return $ "\\[" <> text str <> "\\]" + | isEnabled Ext_tex_math_double_backslash opts -> + return $ "\\\\[" <> text str <> "\\\\]" + | otherwise -> (\x -> cr <> x <> cr) `fmap` + (texMathToInlines DisplayMath str >>= inlineListToMarkdown opts) +inlineToMarkdown opts il@(RawInline f str) = do + plain <- asks envPlain + if not plain && + ( f == "markdown" || + (isEnabled Ext_raw_tex opts && (f == "latex" || f == "tex")) || + (isEnabled Ext_raw_html opts && f == "html") ) + then return $ text str + else do + report $ InlineNotRendered il + return empty +inlineToMarkdown opts (LineBreak) = do + plain <- asks envPlain + if plain || isEnabled Ext_hard_line_breaks opts + then return cr + else return $ + if isEnabled Ext_escaped_line_breaks opts + then "\\" <> cr + else " " <> cr +inlineToMarkdown _ Space = do + escapeSpaces <- asks envEscapeSpaces + return $ if escapeSpaces then "\\ " else space +inlineToMarkdown opts SoftBreak = do + escapeSpaces <- asks envEscapeSpaces + let space' = if escapeSpaces then "\\ " else space + return $ case writerWrapText opts of + WrapNone -> space' + WrapAuto -> space' + WrapPreserve -> cr +inlineToMarkdown opts (Cite [] lst) = inlineListToMarkdown opts lst +inlineToMarkdown opts (Cite (c:cs) lst) + | not (isEnabled Ext_citations opts) = inlineListToMarkdown opts lst + | otherwise = + if citationMode c == AuthorInText + then do + suffs <- inlineListToMarkdown opts $ citationSuffix c + rest <- mapM convertOne cs + let inbr = suffs <+> joincits rest + br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' + return $ text ("@" ++ citationId c) <+> br + else do + cits <- mapM convertOne (c:cs) + return $ text "[" <> joincits cits <> text "]" + where + joincits = hcat . intersperse (text "; ") . filter (not . isEmpty) + convertOne Citation { citationId = k + , citationPrefix = pinlines + , citationSuffix = sinlines + , citationMode = m } + = do + pdoc <- inlineListToMarkdown opts pinlines + sdoc <- inlineListToMarkdown opts sinlines + let k' = text (modekey m ++ "@" ++ k) + r = case sinlines of + Str (y:_):_ | y `elem` (",;]@" :: String) -> k' <> sdoc + _ -> k' <+> sdoc + return $ pdoc <+> r + modekey SuppressAuthor = "-" + modekey _ = "" +inlineToMarkdown opts lnk@(Link attr txt (src, tit)) + | isEnabled Ext_raw_html opts && + not (isEnabled Ext_link_attributes opts) && + attr /= nullAttr = -- use raw HTML + (text . trim) <$> writeHtml5String def (Pandoc nullMeta [Plain [lnk]]) + | otherwise = do + plain <- asks envPlain + linktext <- inlineListToMarkdown opts txt + let linktitle = if null tit + then empty + else text $ " \"" ++ tit ++ "\"" + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) + let useAuto = isURI src && + case txt of + [Str s] | escapeURI s == srcSuffix -> True + _ -> False + let useRefLinks = writerReferenceLinks opts && not useAuto + shortcutable <- asks envRefShortcutable + let useShortcutRefLinks = shortcutable && + isEnabled Ext_shortcut_reference_links opts + ref <- if useRefLinks then getReference attr txt (src, tit) else return [] + reftext <- inlineListToMarkdown opts ref + return $ if useAuto + then if plain + then text srcSuffix + else "<" <> text srcSuffix <> ">" + else if useRefLinks + then let first = "[" <> linktext <> "]" + second = if txt == ref + then if useShortcutRefLinks + then "" + else "[]" + else "[" <> reftext <> "]" + in first <> second + else if plain + then linktext + else "[" <> linktext <> "](" <> + text src <> linktitle <> ")" <> + linkAttributes opts attr +inlineToMarkdown opts img@(Image attr alternate (source, tit)) + | isEnabled Ext_raw_html opts && + not (isEnabled Ext_link_attributes opts) && + attr /= nullAttr = -- use raw HTML + (text . trim) <$> writeHtml5String def (Pandoc nullMeta [Plain [img]]) + | otherwise = do + plain <- asks envPlain + let txt = if null alternate || alternate == [Str source] + -- to prevent autolinks + then [Str ""] + else alternate + linkPart <- inlineToMarkdown opts (Link attr txt (source, tit)) + return $ if plain + then "[" <> linkPart <> "]" + else "!" <> linkPart +inlineToMarkdown opts (Note contents) = do + modify (\st -> st{ stNotes = contents : stNotes st }) + st <- get + let ref = text $ writerIdentifierPrefix opts ++ show (stNoteNum st + (length $ stNotes st) - 1) + if isEnabled Ext_footnotes opts + then return $ "[^" <> ref <> "]" + else return $ "[" <> ref <> "]" + +makeMathPlainer :: [Inline] -> [Inline] +makeMathPlainer = walk go + where + go (Emph xs) = Span nullAttr xs + go x = x + diff --git a/src/Text/Pandoc/Writers/Math.hs b/src/Text/Pandoc/Writers/Math.hs new file mode 100644 index 000000000..b7419ddf9 --- /dev/null +++ b/src/Text/Pandoc/Writers/Math.hs @@ -0,0 +1,49 @@ +module Text.Pandoc.Writers.Math + ( texMathToInlines + , convertMath + ) +where + +import Text.Pandoc.Class +import Text.Pandoc.Definition +import Text.Pandoc.Logging +import Text.TeXMath (Exp, writePandoc, DisplayType(..), readTeX) + +-- | Converts a raw TeX math formula to a list of 'Pandoc' inlines. +-- Defaults to raw formula between @$@ or @$$@ characters if entire formula +-- can't be converted. +texMathToInlines :: PandocMonad m + => MathType + -> String -- ^ String to parse (assumes @'\n'@ line endings) + -> m [Inline] +texMathToInlines mt inp = do + res <- convertMath writePandoc mt inp + case res of + Right (Just ils) -> return ils + Right (Nothing) -> do + report $ CouldNotConvertTeXMath inp "" + return [mkFallback mt inp] + Left il -> return [il] + +mkFallback :: MathType -> String -> Inline +mkFallback mt str = Str (delim ++ str ++ delim) + where delim = case mt of + DisplayMath -> "$$" + InlineMath -> "$" + +-- | Converts a raw TeX math formula using a writer function, +-- issuing a warning and producing a fallback (a raw string) +-- on failure. +convertMath :: PandocMonad m + => (DisplayType -> [Exp] -> a) -> MathType -> String + -> m (Either Inline a) +convertMath writer mt str = do + case writer dt <$> readTeX str of + Right r -> return (Right r) + Left e -> do + report $ CouldNotConvertTeXMath str e + return (Left $ mkFallback mt str) + where dt = case mt of + DisplayMath -> DisplayBlock + InlineMath -> DisplayInline + diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs new file mode 100644 index 000000000..dc6206e6c --- /dev/null +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -0,0 +1,442 @@ +{- +Copyright (C) 2008-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.MediaWiki + Copyright : Copyright (C) 2008-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to MediaWiki markup. + +MediaWiki: <http://www.mediawiki.org/wiki/MediaWiki> +-} +module Text.Pandoc.Writers.MediaWiki ( writeMediaWiki ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Pretty (render) +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.XML ( escapeStringForXML ) +import Data.List ( intersect, intercalate ) +import Network.URI ( isURI ) +import Control.Monad.Reader +import Control.Monad.State +import Text.Pandoc.Class (PandocMonad) + +data WriterState = WriterState { + stNotes :: Bool -- True if there are notes + , stOptions :: WriterOptions -- writer options + } + +data WriterReader = WriterReader { + options :: WriterOptions -- Writer options + , listLevel :: String -- String at beginning of list items, e.g. "**" + , useTags :: Bool -- True if we should use HTML tags because we're in a complex list + } + +type MediaWikiWriter = ReaderT WriterReader (State WriterState) + +-- | Convert Pandoc to MediaWiki. +writeMediaWiki :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeMediaWiki opts document = return $ + let initialState = WriterState { stNotes = False, stOptions = opts } + env = WriterReader { options = opts, listLevel = [], useTags = False } + in evalState (runReaderT (pandocToMediaWiki document) env) initialState + +-- | Return MediaWiki representation of document. +pandocToMediaWiki :: Pandoc -> MediaWikiWriter String +pandocToMediaWiki (Pandoc meta blocks) = do + opts <- asks options + metadata <- metaToJSON opts + (fmap trimr . blockListToMediaWiki) + inlineListToMediaWiki + meta + body <- blockListToMediaWiki blocks + notesExist <- gets stNotes + let notes = if notesExist + then "\n<references />" + else "" + let main = body ++ notes + let context = defField "body" main + $ defField "toc" (writerTableOfContents opts) metadata + return $ case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + +-- | Escape special characters for MediaWiki. +escapeString :: String -> String +escapeString = escapeStringForXML + +-- | Convert Pandoc block element to MediaWiki. +blockToMediaWiki :: Block -- ^ Block element + -> MediaWikiWriter String + +blockToMediaWiki Null = return "" + +blockToMediaWiki (Div attrs bs) = do + contents <- blockListToMediaWiki bs + return $ render Nothing (tagWithAttrs "div" attrs) ++ "\n\n" ++ + contents ++ "\n\n" ++ "</div>" + +blockToMediaWiki (Plain inlines) = + inlineListToMediaWiki inlines + +-- title beginning with fig: indicates that the image is a figure +blockToMediaWiki (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return "" + else ("|caption " ++) `fmap` inlineListToMediaWiki txt + img <- imageToMediaWiki attr + let opt = if null txt + then "" + else "|alt=" ++ if null tit then capt else tit ++ capt + return $ "[[File:" ++ src ++ "|frame|none" ++ img ++ opt ++ "]]\n" + +blockToMediaWiki (Para inlines) = do + tags <- asks useTags + lev <- asks listLevel + contents <- inlineListToMediaWiki inlines + return $ if tags + then "<p>" ++ contents ++ "</p>" + else contents ++ if null lev then "\n" else "" + +blockToMediaWiki (LineBlock lns) = + blockToMediaWiki $ linesToPara lns + +blockToMediaWiki (RawBlock f str) + | f == Format "mediawiki" = return str + | f == Format "html" = return str + | otherwise = return "" + +blockToMediaWiki HorizontalRule = return "\n-----\n" + +blockToMediaWiki (Header level _ inlines) = do + contents <- inlineListToMediaWiki inlines + let eqs = replicate level '=' + return $ eqs ++ " " ++ contents ++ " " ++ eqs ++ "\n" + +blockToMediaWiki (CodeBlock (_,classes,_) str) = do + let at = classes `intersect` ["actionscript", "ada", "apache", "applescript", "asm", "asp", + "autoit", "bash", "blitzbasic", "bnf", "c", "c_mac", "caddcl", "cadlisp", "cfdg", "cfm", + "cpp", "cpp-qt", "csharp", "css", "d", "delphi", "diff", "div", "dos", "eiffel", "fortran", + "freebasic", "gml", "groovy", "html4strict", "idl", "ini", "inno", "io", "java", "java5", + "javascript", "latex", "lisp", "lua", "matlab", "mirc", "mpasm", "mysql", "nsis", "objc", + "ocaml", "ocaml-brief", "oobas", "oracle8", "pascal", "perl", "php", "php-brief", "plsql", + "python", "qbasic", "rails", "reg", "robots", "ruby", "sas", "scheme", "sdlbasic", + "smalltalk", "smarty", "sql", "tcl", "", "thinbasic", "tsql", "vb", "vbnet", "vhdl", + "visualfoxpro", "winbatch", "xml", "xpp", "z80"] + return $ + if null at + then "<pre" ++ (if null classes + then ">" + else " class=\"" ++ unwords classes ++ "\">") ++ + escapeString str ++ "</pre>" + else "<source lang=\"" ++ head at ++ "\">" ++ str ++ "</source>" + -- note: no escape! + +blockToMediaWiki (BlockQuote blocks) = do + contents <- blockListToMediaWiki blocks + return $ "<blockquote>" ++ contents ++ "</blockquote>" + +blockToMediaWiki (Table capt aligns widths headers rows') = do + caption <- if null capt + then return "" + else do + c <- inlineListToMediaWiki capt + return $ "|+ " ++ trimr c ++ "\n" + let headless = all null headers + let allrows = if headless then rows' else headers:rows' + tableBody <- intercalate "|-\n" `fmap` + mapM (tableRowToMediaWiki headless aligns widths) + (zip [1..] allrows) + return $ "{|\n" ++ caption ++ tableBody ++ "|}\n" + +blockToMediaWiki x@(BulletList items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags + then do + contents <- local (\ s -> s { useTags = True }) $ mapM listItemToMediaWiki items + return $ "<ul>\n" ++ vcat contents ++ "</ul>\n" + else do + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ "*" }) $ mapM listItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" + +blockToMediaWiki x@(OrderedList attribs items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags + then do + contents <- local (\s -> s { useTags = True }) $ mapM listItemToMediaWiki items + return $ "<ol" ++ listAttribsToString attribs ++ ">\n" ++ vcat contents ++ "</ol>\n" + else do + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ "#" }) $ mapM listItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" + +blockToMediaWiki x@(DefinitionList items) = do + tags <- fmap (|| not (isSimpleList x)) $ asks useTags + if tags + then do + contents <- local (\s -> s { useTags = True }) $ mapM definitionListItemToMediaWiki items + return $ "<dl>\n" ++ vcat contents ++ "</dl>\n" + else do + lev <- asks listLevel + contents <- local (\s -> s { listLevel = listLevel s ++ ";" }) $ mapM definitionListItemToMediaWiki items + return $ vcat contents ++ if null lev then "\n" else "" + +-- Auxiliary functions for lists: + +-- | Convert ordered list attributes to HTML attribute string +listAttribsToString :: ListAttributes -> String +listAttribsToString (startnum, numstyle, _) = + let numstyle' = camelCaseToHyphenated $ show numstyle + in (if startnum /= 1 + then " start=\"" ++ show startnum ++ "\"" + else "") ++ + (if numstyle /= DefaultStyle + then " style=\"list-style-type: " ++ numstyle' ++ ";\"" + else "") + +-- | Convert bullet or ordered list item (list of blocks) to MediaWiki. +listItemToMediaWiki :: [Block] -> MediaWikiWriter String +listItemToMediaWiki items = do + contents <- blockListToMediaWiki items + tags <- asks useTags + if tags + then return $ "<li>" ++ contents ++ "</li>" + else do + marker <- asks listLevel + return $ marker ++ " " ++ contents + +-- | Convert definition list item (label, list of blocks) to MediaWiki. +definitionListItemToMediaWiki :: ([Inline],[[Block]]) + -> MediaWikiWriter String +definitionListItemToMediaWiki (label, items) = do + labelText <- inlineListToMediaWiki label + contents <- mapM blockListToMediaWiki items + tags <- asks useTags + if tags + then return $ "<dt>" ++ labelText ++ "</dt>\n" ++ + intercalate "\n" (map (\d -> "<dd>" ++ d ++ "</dd>") contents) + else do + marker <- asks listLevel + return $ marker ++ " " ++ labelText ++ "\n" ++ + intercalate "\n" (map (\d -> init marker ++ ": " ++ d) contents) + +-- | True if the list can be handled by simple wiki markup, False if HTML tags will be needed. +isSimpleList :: Block -> Bool +isSimpleList x = + case x of + BulletList items -> all isSimpleListItem items + OrderedList (num, sty, _) items -> all isSimpleListItem items && + num == 1 && sty `elem` [DefaultStyle, Decimal] + DefinitionList items -> all isSimpleListItem $ concatMap snd items + _ -> False + +-- | True if list item can be handled with the simple wiki syntax. False if +-- HTML tags will be needed. +isSimpleListItem :: [Block] -> Bool +isSimpleListItem [] = True +isSimpleListItem [x] = + case x of + Plain _ -> True + Para _ -> True + BulletList _ -> isSimpleList x + OrderedList _ _ -> isSimpleList x + DefinitionList _ -> isSimpleList x + _ -> False +isSimpleListItem [x, y] | isPlainOrPara x = + case y of + BulletList _ -> isSimpleList y + OrderedList _ _ -> isSimpleList y + DefinitionList _ -> isSimpleList y + _ -> False +isSimpleListItem _ = False + +isPlainOrPara :: Block -> Bool +isPlainOrPara (Plain _) = True +isPlainOrPara (Para _) = True +isPlainOrPara _ = False + +-- | Concatenates strings with line breaks between them. +vcat :: [String] -> String +vcat = intercalate "\n" + +-- Auxiliary functions for tables: + +tableRowToMediaWiki :: Bool + -> [Alignment] + -> [Double] + -> (Int, [[Block]]) + -> MediaWikiWriter String +tableRowToMediaWiki headless alignments widths (rownum, cells) = do + cells' <- mapM (tableCellToMediaWiki headless rownum) + $ zip3 alignments widths cells + return $ unlines cells' + +tableCellToMediaWiki :: Bool + -> Int + -> (Alignment, Double, [Block]) + -> MediaWikiWriter String +tableCellToMediaWiki headless rownum (alignment, width, bs) = do + contents <- blockListToMediaWiki bs + let marker = if rownum == 1 && not headless then "!" else "|" + let percent w = show (truncate (100*w) :: Integer) ++ "%" + let attrs = ["align=" ++ show (alignmentToString alignment) | + alignment /= AlignDefault && alignment /= AlignLeft] ++ + ["width=\"" ++ percent width ++ "\"" | + width /= 0.0 && rownum == 1] + let attr = if null attrs + then "" + else unwords attrs ++ "|" + let sep = case bs of + [Plain _] -> " " + [Para _] -> " " + _ -> "\n" + return $ marker ++ attr ++ sep ++ trimr contents + +alignmentToString :: Alignment -> String +alignmentToString alignment = case alignment of + AlignLeft -> "left" + AlignRight -> "right" + AlignCenter -> "center" + AlignDefault -> "left" + +imageToMediaWiki :: Attr -> MediaWikiWriter String +imageToMediaWiki attr = do + opts <- gets stOptions + let (_, cls, _) = attr + toPx = fmap (showInPixel opts) . checkPct + checkPct (Just (Percent _)) = Nothing + checkPct maybeDim = maybeDim + go (Just w) Nothing = '|':w ++ "px" + go (Just w) (Just h) = '|':w ++ "x" ++ h ++ "px" + go Nothing (Just h) = "|x" ++ h ++ "px" + go Nothing Nothing = "" + dims = go (toPx $ dimension Width attr) (toPx $ dimension Height attr) + classes = if null cls + then "" + else "|class=" ++ unwords cls + return $ dims ++ classes + +-- | Convert list of Pandoc block elements to MediaWiki. +blockListToMediaWiki :: [Block] -- ^ List of block elements + -> MediaWikiWriter String +blockListToMediaWiki blocks = + fmap vcat $ mapM blockToMediaWiki blocks + +-- | Convert list of Pandoc inline elements to MediaWiki. +inlineListToMediaWiki :: [Inline] -> MediaWikiWriter String +inlineListToMediaWiki lst = + fmap concat $ mapM inlineToMediaWiki lst + +-- | Convert Pandoc inline element to MediaWiki. +inlineToMediaWiki :: Inline -> MediaWikiWriter String + +inlineToMediaWiki (Span attrs ils) = do + contents <- inlineListToMediaWiki ils + return $ render Nothing (tagWithAttrs "span" attrs) ++ contents ++ "</span>" + +inlineToMediaWiki (Emph lst) = do + contents <- inlineListToMediaWiki lst + return $ "''" ++ contents ++ "''" + +inlineToMediaWiki (Strong lst) = do + contents <- inlineListToMediaWiki lst + return $ "'''" ++ contents ++ "'''" + +inlineToMediaWiki (Strikeout lst) = do + contents <- inlineListToMediaWiki lst + return $ "<s>" ++ contents ++ "</s>" + +inlineToMediaWiki (Superscript lst) = do + contents <- inlineListToMediaWiki lst + return $ "<sup>" ++ contents ++ "</sup>" + +inlineToMediaWiki (Subscript lst) = do + contents <- inlineListToMediaWiki lst + return $ "<sub>" ++ contents ++ "</sub>" + +inlineToMediaWiki (SmallCaps lst) = inlineListToMediaWiki lst + +inlineToMediaWiki (Quoted SingleQuote lst) = do + contents <- inlineListToMediaWiki lst + return $ "\8216" ++ contents ++ "\8217" + +inlineToMediaWiki (Quoted DoubleQuote lst) = do + contents <- inlineListToMediaWiki lst + return $ "\8220" ++ contents ++ "\8221" + +inlineToMediaWiki (Cite _ lst) = inlineListToMediaWiki lst + +inlineToMediaWiki (Code _ str) = + return $ "<code>" ++ escapeString str ++ "</code>" + +inlineToMediaWiki (Str str) = return $ escapeString str + +inlineToMediaWiki (Math _ str) = return $ "<math>" ++ str ++ "</math>" + -- note: str should NOT be escaped + +inlineToMediaWiki (RawInline f str) + | f == Format "mediawiki" = return str + | f == Format "html" = return str + | otherwise = return "" + +inlineToMediaWiki LineBreak = return "<br />\n" + +inlineToMediaWiki SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + case wrapText of + WrapAuto -> return " " + WrapNone -> return " " + WrapPreserve -> return "\n" + +inlineToMediaWiki Space = return " " + +inlineToMediaWiki (Link _ txt (src, _)) = do + label <- inlineListToMediaWiki txt + case txt of + [Str s] | isURI src && escapeURI s == src -> return src + _ -> return $ if isURI src + then "[" ++ src ++ " " ++ label ++ "]" + else "[[" ++ src' ++ "|" ++ label ++ "]]" + where src' = case src of + '/':xs -> xs -- with leading / it's a + _ -> src -- link to a help page + +inlineToMediaWiki (Image attr alt (source, tit)) = do + img <- imageToMediaWiki attr + alt' <- inlineListToMediaWiki alt + let txt = if null tit + then if null alt + then "" + else '|' : alt' + else '|' : tit + return $ "[[File:" ++ source ++ img ++ txt ++ "]]" + +inlineToMediaWiki (Note contents) = do + contents' <- blockListToMediaWiki contents + modify (\s -> s { stNotes = True }) + return $ "<ref>" ++ contents' ++ "</ref>" + -- note - may not work for notes with multiple blocks diff --git a/src/Text/Pandoc/Writers/Native.hs b/src/Text/Pandoc/Writers/Native.hs new file mode 100644 index 000000000..2421fd94d --- /dev/null +++ b/src/Text/Pandoc/Writers/Native.hs @@ -0,0 +1,79 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Native + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of a 'Pandoc' document to a string representation. +-} +module Text.Pandoc.Writers.Native ( writeNative ) +where +import Text.Pandoc.Options ( WriterOptions(..), WrapOption(..) ) +import Data.List ( intersperse ) +import Text.Pandoc.Definition +import Text.Pandoc.Pretty +import Text.Pandoc.Class (PandocMonad) + +prettyList :: [Doc] -> Doc +prettyList ds = + "[" <> (cat $ intersperse (cr <> ",") $ map (nest 1) ds) <> "]" + +-- | Prettyprint Pandoc block element. +prettyBlock :: Block -> Doc +prettyBlock (LineBlock lines') = + "LineBlock" $$ prettyList (map (text . show) lines') +prettyBlock (BlockQuote blocks) = + "BlockQuote" $$ prettyList (map prettyBlock blocks) +prettyBlock (OrderedList attribs blockLists) = + "OrderedList" <> space <> text (show attribs) $$ + (prettyList $ map (prettyList . map prettyBlock) blockLists) +prettyBlock (BulletList blockLists) = + "BulletList" $$ + (prettyList $ map (prettyList . map prettyBlock) blockLists) +prettyBlock (DefinitionList items) = "DefinitionList" $$ + (prettyList $ map deflistitem items) + where deflistitem (term, defs) = "(" <> text (show term) <> "," <> cr <> + nest 1 (prettyList $ map (prettyList . map prettyBlock) defs) <> ")" +prettyBlock (Table caption aligns widths header rows) = + "Table " <> text (show caption) <> " " <> text (show aligns) <> " " <> + text (show widths) $$ + prettyRow header $$ + prettyList (map prettyRow rows) + where prettyRow cols = prettyList (map (prettyList . map prettyBlock) cols) +prettyBlock (Div attr blocks) = + text ("Div " <> show attr) $$ prettyList (map prettyBlock blocks) +prettyBlock block = text $ show block + +-- | Prettyprint Pandoc document. +writeNative :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeNative opts (Pandoc meta blocks) = return $ + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + withHead = case writerTemplate opts of + Just _ -> \bs -> text ("Pandoc (" ++ show meta ++ ")") $$ + bs $$ cr + Nothing -> id + in render colwidth $ withHead $ prettyList $ map prettyBlock blocks diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs new file mode 100644 index 000000000..ee5fa4c24 --- /dev/null +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -0,0 +1,210 @@ +{-# LANGUAGE ScopedTypeVariables #-} +{- +Copyright (C) 2008-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.ODT + Copyright : Copyright (C) 2008-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to ODT. +-} +module Text.Pandoc.Writers.ODT ( writeODT ) where +import Data.List ( isPrefixOf ) +import Data.Maybe ( fromMaybe ) +import Text.XML.Light.Output +import Text.TeXMath +import qualified Data.ByteString.Lazy as B +import Text.Pandoc.UTF8 ( fromStringLazy ) +import Codec.Archive.Zip +import Text.Pandoc.Options ( WriterOptions(..), WrapOption(..) ) +import Text.Pandoc.Shared ( stringify ) +import Text.Pandoc.ImageSize +import Text.Pandoc.MIME ( getMimeType, extensionFromMimeType ) +import Text.Pandoc.Definition +import Text.Pandoc.Walk +import Text.Pandoc.Writers.Shared ( fixDisplayMath ) +import Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) +import Control.Monad.State +import Control.Monad.Except (runExceptT) +import Text.Pandoc.Error (PandocError) +import Text.Pandoc.XML +import Text.Pandoc.Pretty +import System.FilePath ( takeExtension, takeDirectory, (<.>)) +import Text.Pandoc.Class ( PandocMonad, report ) +import qualified Text.Pandoc.Class as P +import Text.Pandoc.Logging + +data ODTState = ODTState { stEntries :: [Entry] + } + +type O m = StateT ODTState m + +-- | Produce an ODT file from a Pandoc document. +writeODT :: PandocMonad m + => WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> m B.ByteString +writeODT opts doc = + let initState = ODTState{ stEntries = [] + } + in + evalStateT (pandocToODT opts doc) initState + +-- | Produce an ODT file from a Pandoc document. +pandocToODT :: PandocMonad m + => WriterOptions -- ^ Writer options + -> Pandoc -- ^ Document to convert + -> O m B.ByteString +pandocToODT opts doc@(Pandoc meta _) = do + let datadir = writerUserDataDir opts + let title = docTitle meta + refArchive <- + case writerReferenceDoc opts of + Just f -> liftM toArchive $ lift $ P.readFileLazy f + Nothing -> lift $ (toArchive . B.fromStrict) <$> + P.readDataFile datadir "reference.odt" + -- handle formulas and pictures + -- picEntriesRef <- P.newIORef ([] :: [Entry]) + doc' <- walkM (transformPicMath opts) $ walk fixDisplayMath doc + newContents <- lift $ writeOpenDocument opts{writerWrapText = WrapNone} doc' + epochtime <- floor `fmap` (lift P.getPOSIXTime) + let contentEntry = toEntry "content.xml" epochtime + $ fromStringLazy newContents + picEntries <- gets stEntries + let archive = foldr addEntryToArchive refArchive + $ contentEntry : picEntries + -- construct META-INF/manifest.xml based on archive + let toFileEntry fp = case getMimeType fp of + Nothing -> empty + Just m -> selfClosingTag "manifest:file-entry" + [("manifest:media-type", m) + ,("manifest:full-path", fp) + ,("manifest:version", "1.2") + ] + let files = [ ent | ent <- filesInArchive archive, + not ("META-INF" `isPrefixOf` ent) ] + let formulas = [ takeDirectory ent ++ "/" | ent <- filesInArchive archive, + "Formula-" `isPrefixOf` ent, takeExtension ent == ".xml" ] + let manifestEntry = toEntry "META-INF/manifest.xml" epochtime + $ fromStringLazy $ render Nothing + $ text "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + $$ + ( inTags True "manifest:manifest" + [("xmlns:manifest","urn:oasis:names:tc:opendocument:xmlns:manifest:1.0") + ,("manifest:version","1.2")] + $ ( selfClosingTag "manifest:file-entry" + [("manifest:media-type","application/vnd.oasis.opendocument.text") + ,("manifest:full-path","/")] + $$ vcat ( map toFileEntry $ files ) + $$ vcat ( map toFileEntry $ formulas ) + ) + ) + let archive' = addEntryToArchive manifestEntry archive + let metaEntry = toEntry "meta.xml" epochtime + $ fromStringLazy $ render Nothing + $ text "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + $$ + ( inTags True "office:document-meta" + [("xmlns:office","urn:oasis:names:tc:opendocument:xmlns:office:1.0") + ,("xmlns:xlink","http://www.w3.org/1999/xlink") + ,("xmlns:dc","http://purl.org/dc/elements/1.1/") + ,("xmlns:meta","urn:oasis:names:tc:opendocument:xmlns:meta:1.0") + ,("xmlns:ooo","http://openoffice.org/2004/office") + ,("xmlns:grddl","http://www.w3.org/2003/g/data-view#") + ,("office:version","1.2")] + $ ( inTagsSimple "office:meta" + $ ( inTagsSimple "dc:title" (text $ escapeStringForXML (stringify title)) + ) + ) + ) + -- make sure mimetype is first + let mimetypeEntry = toEntry "mimetype" epochtime + $ fromStringLazy "application/vnd.oasis.opendocument.text" + let archive'' = addEntryToArchive mimetypeEntry + $ addEntryToArchive metaEntry archive' + return $ fromArchive archive'' + +-- | transform both Image and Math elements +transformPicMath :: PandocMonad m => WriterOptions ->Inline -> O m Inline +transformPicMath opts (Image attr@(id', cls, _) lab (src,t)) = do + res <- runExceptT $ lift $ P.fetchItem (writerSourceURL opts) src + case res of + Left (_ :: PandocError) -> do + report $ CouldNotFetchResource src "" + return $ Emph lab + Right (img, mbMimeType) -> do + (ptX, ptY) <- case imageSize img of + Right s -> return $ sizeInPoints s + Left msg -> do + report $ CouldNotDetermineImageSize src msg + return (100, 100) + let dims = + case (getDim Width, getDim Height) of + (Just w, Just h) -> [("width", show w), ("height", show h)] + (Just w@(Percent _), Nothing) -> [("width", show w), ("style:rel-height", "scale")] + (Nothing, Just h@(Percent _)) -> [("style:rel-width", "scale"), ("height", show h)] + (Just w@(Inch i), Nothing) -> [("width", show w), ("height", show (i / ratio) ++ "in")] + (Nothing, Just h@(Inch i)) -> [("width", show (i * ratio) ++ "in"), ("height", show h)] + _ -> [("width", show ptX ++ "pt"), ("height", show ptY ++ "pt")] + where + ratio = ptX / ptY + getDim dir = case (dimension dir attr) of + Just (Percent i) -> Just $ Percent i + Just dim -> Just $ Inch $ inInch opts dim + Nothing -> Nothing + let newattr = (id', cls, dims) + entries <- gets stEntries + let extension = fromMaybe (takeExtension $ takeWhile (/='?') src) + (mbMimeType >>= extensionFromMimeType) + let newsrc = "Pictures/" ++ show (length entries) <.> extension + let toLazy = B.fromChunks . (:[]) + epochtime <- floor `fmap` (lift P.getPOSIXTime) + let entry = toEntry newsrc epochtime $ toLazy img + modify $ \st -> st{ stEntries = entry : entries } + return $ Image newattr lab (newsrc, t) +transformPicMath _ (Math t math) = do + entries <- gets stEntries + let dt = if t == InlineMath then DisplayInline else DisplayBlock + case writeMathML dt <$> readTeX math of + Left _ -> return $ Math t math + Right r -> do + let conf = useShortEmptyTags (const False) defaultConfigPP + let mathml = ppcTopElement conf r + epochtime <- floor `fmap` (lift $ P.getPOSIXTime) + let dirname = "Formula-" ++ show (length entries) ++ "/" + let fname = dirname ++ "content.xml" + let entry = toEntry fname epochtime (fromStringLazy mathml) + modify $ \st -> st{ stEntries = entry : entries } + return $ RawInline (Format "opendocument") $ render Nothing $ + inTags False "draw:frame" [("text:anchor-type", + if t == DisplayMath + then "paragraph" + else "as-char") + ,("style:vertical-pos", "middle") + ,("style:vertical-rel", "text")] $ + selfClosingTag "draw:object" [("xlink:href", dirname) + , ("xlink:type", "simple") + , ("xlink:show", "embed") + , ("xlink:actuate", "onLoad")] + +transformPicMath _ x = return x diff --git a/src/Text/Pandoc/Writers/OPML.hs b/src/Text/Pandoc/Writers/OPML.hs new file mode 100644 index 000000000..bc0cfc300 --- /dev/null +++ b/src/Text/Pandoc/Writers/OPML.hs @@ -0,0 +1,103 @@ +{-# LANGUAGE CPP #-} +{- +Copyright (C) 2013-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.OPML + Copyright : Copyright (C) 2013-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to OPML XML. +-} +module Text.Pandoc.Writers.OPML ( writeOPML) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Writers.HTML (writeHtml5String) +import Text.Pandoc.Writers.Markdown (writeMarkdown) +import Text.Pandoc.Pretty +import Text.Pandoc.Compat.Time +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Error +import Control.Monad.Except (throwError) +import Text.Pandoc.Class (PandocMonad) + +-- | Convert Pandoc document to string in OPML format. +writeOPML :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeOPML opts (Pandoc meta blocks) = do + let elements = hierarchicalize blocks + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + meta' = B.setMeta "date" (B.str $ convertDate $ docDate meta) meta + metadata <- metaToJSON opts + (writeMarkdown def . Pandoc nullMeta) + (\ils -> trimr <$> (writeMarkdown def $ Pandoc nullMeta [Plain ils])) + meta' + main <- (render colwidth . vcat) <$> (mapM (elementToOPML opts) elements) + let context = defField "body" main metadata + return $ case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + + +writeHtmlInlines :: PandocMonad m => [Inline] -> m String +writeHtmlInlines ils = + trim <$> (writeHtml5String def $ Pandoc nullMeta [Plain ils]) + +-- date format: RFC 822: Thu, 14 Jul 2005 23:41:05 GMT +showDateTimeRFC822 :: UTCTime -> String +showDateTimeRFC822 = formatTime defaultTimeLocale "%a, %d %b %Y %X %Z" + +convertDate :: [Inline] -> String +convertDate ils = maybe "" showDateTimeRFC822 $ +#if MIN_VERSION_time(1,5,0) + parseTimeM True +#else + parseTime +#endif + defaultTimeLocale "%F" =<< (normalizeDate $ stringify ils) + +-- | Convert an Element to OPML. +elementToOPML :: PandocMonad m => WriterOptions -> Element -> m Doc +elementToOPML _ (Blk _) = return empty +elementToOPML opts (Sec _ _num _ title elements) = do + let isBlk :: Element -> Bool + isBlk (Blk _) = True + isBlk _ = False + + fromBlk :: PandocMonad m => Element -> m Block + fromBlk (Blk x) = return x + fromBlk _ = throwError $ PandocSomeError "fromBlk called on non-block" + + (blocks, rest) = span isBlk elements + htmlIls <- writeHtmlInlines title + md <- if null blocks + then return [] + else do blks <- mapM fromBlk blocks + writeMarkdown def $ Pandoc nullMeta blks + let attrs = [("text", htmlIls)] ++ [("_note", md) | not (null blocks)] + o <- mapM (elementToOPML opts) rest + return $ inTags True "outline" attrs $ vcat o diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs new file mode 100644 index 000000000..851e18b8e --- /dev/null +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -0,0 +1,626 @@ +{-# LANGUAGE PatternGuards, OverloadedStrings, FlexibleContexts #-} +{- +Copyright (C) 2008-2015 Andrea Rossato <andrea.rossato@ing.unitn.it> + and John MacFarlane. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.OpenDocument + Copyright : Copyright (C) 2008-2015 Andrea Rossato and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Andrea Rossato <andrea.rossato@ing.unitn.it> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to OpenDocument XML. +-} +module Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.XML +import Text.Pandoc.Shared (linesToPara) +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Writers.Math +import Text.Pandoc.Pretty +import Text.Printf ( printf ) +import Control.Arrow ( (***), (>>>) ) +import Control.Monad.State hiding ( when ) +import Data.Char (chr) +import qualified Data.Set as Set +import qualified Data.Map as Map +import Text.Pandoc.Writers.Shared +import Data.List (sortBy) +import Data.Ord (comparing) +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +-- | Auxiliary function to convert Plain block to Para. +plainToPara :: Block -> Block +plainToPara (Plain x) = Para x +plainToPara x = x + +-- +-- OpenDocument writer +-- + +type OD m = StateT WriterState m + +data WriterState = + WriterState { stNotes :: [Doc] + , stTableStyles :: [Doc] + , stParaStyles :: [Doc] + , stListStyles :: [(Int, [Doc])] + , stTextStyles :: Map.Map (Set.Set TextStyle) (String, Doc) + , stTextStyleAttr :: Set.Set TextStyle + , stIndentPara :: Int + , stInDefinition :: Bool + , stTight :: Bool + , stFirstPara :: Bool + , stImageId :: Int + } + +defaultWriterState :: WriterState +defaultWriterState = + WriterState { stNotes = [] + , stTableStyles = [] + , stParaStyles = [] + , stListStyles = [] + , stTextStyles = Map.empty + , stTextStyleAttr = Set.empty + , stIndentPara = 0 + , stInDefinition = False + , stTight = False + , stFirstPara = False + , stImageId = 1 + } + +when :: Bool -> Doc -> Doc +when p a = if p then a else empty + +addTableStyle :: PandocMonad m => Doc -> OD m () +addTableStyle i = modify $ \s -> s { stTableStyles = i : stTableStyles s } + +addNote :: PandocMonad m => Doc -> OD m () +addNote i = modify $ \s -> s { stNotes = i : stNotes s } + +addParaStyle :: PandocMonad m => Doc -> OD m () +addParaStyle i = modify $ \s -> s { stParaStyles = i : stParaStyles s } + +addTextStyle :: PandocMonad m => Set.Set TextStyle -> (String, Doc) -> OD m () +addTextStyle attrs i = modify $ \s -> + s { stTextStyles = Map.insert attrs i (stTextStyles s) } + +addTextStyleAttr :: PandocMonad m => TextStyle -> OD m () +addTextStyleAttr t = modify $ \s -> + s { stTextStyleAttr = Set.insert t (stTextStyleAttr s) } + +increaseIndent :: PandocMonad m => OD m () +increaseIndent = modify $ \s -> s { stIndentPara = 1 + stIndentPara s } + +resetIndent :: PandocMonad m => OD m () +resetIndent = modify $ \s -> s { stIndentPara = (stIndentPara s) - 1 } + +inTightList :: PandocMonad m => OD m a -> OD m a +inTightList f = modify (\s -> s { stTight = True }) >> f >>= \r -> + modify (\s -> s { stTight = False }) >> return r + +setInDefinitionList :: PandocMonad m => Bool -> OD m () +setInDefinitionList b = modify $ \s -> s { stInDefinition = b } + +setFirstPara :: PandocMonad m => OD m () +setFirstPara = modify $ \s -> s { stFirstPara = True } + +inParagraphTags :: PandocMonad m => Doc -> OD m Doc +inParagraphTags d | isEmpty d = return empty +inParagraphTags d = do + b <- gets stFirstPara + a <- if b + then do modify $ \st -> st { stFirstPara = False } + return $ [("text:style-name", "First_20_paragraph")] + else return [("text:style-name", "Text_20_body")] + return $ inTags False "text:p" a d + +inParagraphTagsWithStyle :: String -> Doc -> Doc +inParagraphTagsWithStyle sty = inTags False "text:p" [("text:style-name", sty)] + +inSpanTags :: String -> Doc -> Doc +inSpanTags s = inTags False "text:span" [("text:style-name",s)] + +withTextStyle :: PandocMonad m => TextStyle -> OD m a -> OD m a +withTextStyle s f = do + oldTextStyleAttr <- gets stTextStyleAttr + addTextStyleAttr s + res <- f + modify $ \st -> st{ stTextStyleAttr = oldTextStyleAttr } + return res + +inTextStyle :: PandocMonad m => Doc -> OD m Doc +inTextStyle d = do + at <- gets stTextStyleAttr + if Set.null at + then return d + else do + styles <- gets stTextStyles + case Map.lookup at styles of + Just (styleName, _) -> return $ + inTags False "text:span" [("text:style-name",styleName)] d + Nothing -> do + let styleName = "T" ++ show (Map.size styles + 1) + addTextStyle at (styleName, + inTags False "style:style" + [("style:name", styleName) + ,("style:family", "text")] + $ selfClosingTag "style:text-properties" + (concatMap textStyleAttr (Set.toList at))) + return $ inTags False + "text:span" [("text:style-name",styleName)] d + +inHeaderTags :: PandocMonad m => Int -> Doc -> OD m Doc +inHeaderTags i d = + return $ inTags False "text:h" [ ("text:style-name", "Heading_20_" ++ show i) + , ("text:outline-level", show i)] d + +inQuotes :: QuoteType -> Doc -> Doc +inQuotes SingleQuote s = char '\8216' <> s <> char '\8217' +inQuotes DoubleQuote s = char '\8220' <> s <> char '\8221' + +handleSpaces :: String -> Doc +handleSpaces s + | ( ' ':_) <- s = genTag s + | ('\t':x) <- s = selfClosingTag "text:tab" [] <> rm x + | otherwise = rm s + where + genTag = span (==' ') >>> tag . length *** rm >>> uncurry (<>) + tag n = when (n /= 0) $ selfClosingTag "text:s" [("text:c", show n)] + rm ( ' ':xs) = char ' ' <> genTag xs + rm ('\t':xs) = selfClosingTag "text:tab" [] <> genTag xs + rm ( x:xs) = char x <> rm xs + rm [] = empty + +-- | Convert Pandoc document to string in OpenDocument format. +writeOpenDocument :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeOpenDocument opts (Pandoc meta blocks) = do + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + let render' = render colwidth + ((body, metadata),s) <- flip runStateT + defaultWriterState $ do + m <- metaToJSON opts + (fmap (render colwidth) . blocksToOpenDocument opts) + (fmap (render colwidth) . inlinesToOpenDocument opts) + meta + b <- render' `fmap` blocksToOpenDocument opts blocks + return (b, m) + let styles = stTableStyles s ++ stParaStyles s ++ + map snd (reverse $ sortBy (comparing fst) $ + Map.elems (stTextStyles s)) + listStyle (n,l) = inTags True "text:list-style" + [("style:name", "L" ++ show n)] (vcat l) + let listStyles = map listStyle (stListStyles s) + let automaticStyles = vcat $ reverse $ styles ++ listStyles + let context = defField "body" body + $ defField "automatic-styles" (render' automaticStyles) + $ metadata + return $ case writerTemplate opts of + Nothing -> body + Just tpl -> renderTemplate' tpl context + +withParagraphStyle :: PandocMonad m + => WriterOptions -> String -> [Block] -> OD m Doc +withParagraphStyle o s (b:bs) + | Para l <- b = go =<< inParagraphTagsWithStyle s <$> inlinesToOpenDocument o l + | otherwise = go =<< blockToOpenDocument o b + where go i = (<>) i <$> withParagraphStyle o s bs +withParagraphStyle _ _ [] = return empty + +inPreformattedTags :: PandocMonad m => String -> OD m Doc +inPreformattedTags s = do + n <- paraStyle [("style:parent-style-name","Preformatted_20_Text")] + return . inParagraphTagsWithStyle ("P" ++ show n) . handleSpaces $ s + +orderedListToOpenDocument :: PandocMonad m + => WriterOptions -> Int -> [[Block]] -> OD m Doc +orderedListToOpenDocument o pn bs = + vcat . map (inTagsIndented "text:list-item") <$> + mapM (orderedItemToOpenDocument o pn . map plainToPara) bs + +orderedItemToOpenDocument :: PandocMonad m + => WriterOptions -> Int -> [Block] -> OD m Doc +orderedItemToOpenDocument o n (b:bs) + | OrderedList a l <- b = newLevel a l + | Para l <- b = go =<< inParagraphTagsWithStyle ("P" ++ show n) <$> inlinesToOpenDocument o l + | otherwise = go =<< blockToOpenDocument o b + where + go i = ($$) i <$> orderedItemToOpenDocument o n bs + newLevel a l = do + nn <- length <$> gets stParaStyles + ls <- head <$> gets stListStyles + modify $ \s -> s { stListStyles = orderedListLevelStyle a ls : tail (stListStyles s) } + inTagsIndented "text:list" <$> orderedListToOpenDocument o nn l +orderedItemToOpenDocument _ _ [] = return empty + +isTightList :: [[Block]] -> Bool +isTightList [] = False +isTightList (b:_) + | Plain {} : _ <- b = True + | otherwise = False + +newOrderedListStyle :: PandocMonad m + => Bool -> ListAttributes -> OD m (Int,Int) +newOrderedListStyle b a = do + ln <- (+) 1 . length <$> gets stListStyles + let nbs = orderedListLevelStyle a (ln, []) + pn <- if b then inTightList (paraListStyle ln) else paraListStyle ln + modify $ \s -> s { stListStyles = nbs : stListStyles s } + return (ln,pn) + +bulletListToOpenDocument :: PandocMonad m + => WriterOptions -> [[Block]] -> OD m Doc +bulletListToOpenDocument o b = do + ln <- (+) 1 . length <$> gets stListStyles + (pn,ns) <- if isTightList b then inTightList (bulletListStyle ln) else bulletListStyle ln + modify $ \s -> s { stListStyles = ns : stListStyles s } + is <- listItemsToOpenDocument ("P" ++ show pn) o b + return $ inTags True "text:list" [("text:style-name", "L" ++ show ln)] is + +listItemsToOpenDocument :: PandocMonad m + => String -> WriterOptions -> [[Block]] -> OD m Doc +listItemsToOpenDocument s o is = + vcat . map (inTagsIndented "text:list-item") <$> mapM (withParagraphStyle o s . map plainToPara) is + +deflistItemToOpenDocument :: PandocMonad m + => WriterOptions -> ([Inline],[[Block]]) -> OD m Doc +deflistItemToOpenDocument o (t,d) = do + let ts = if isTightList d + then "Definition_20_Term_20_Tight" else "Definition_20_Term" + ds = if isTightList d + then "Definition_20_Definition_20_Tight" else "Definition_20_Definition" + t' <- withParagraphStyle o ts [Para t] + d' <- liftM vcat $ mapM (withParagraphStyle o ds . (map plainToPara)) d + return $ t' $$ d' + +inBlockQuote :: PandocMonad m + => WriterOptions -> Int -> [Block] -> OD m Doc +inBlockQuote o i (b:bs) + | BlockQuote l <- b = do increaseIndent + ni <- paraStyle + [("style:parent-style-name","Quotations")] + go =<< inBlockQuote o ni (map plainToPara l) + | Para l <- b = do go =<< inParagraphTagsWithStyle ("P" ++ show i) <$> inlinesToOpenDocument o l + | otherwise = do go =<< blockToOpenDocument o b + where go block = ($$) block <$> inBlockQuote o i bs +inBlockQuote _ _ [] = resetIndent >> return empty + +-- | Convert a list of Pandoc blocks to OpenDocument. +blocksToOpenDocument :: PandocMonad m => WriterOptions -> [Block] -> OD m Doc +blocksToOpenDocument o b = vcat <$> mapM (blockToOpenDocument o) b + +-- | Convert a Pandoc block element to OpenDocument. +blockToOpenDocument :: PandocMonad m => WriterOptions -> Block -> OD m Doc +blockToOpenDocument o bs + | Plain b <- bs = if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + | Para [Image attr c (s,'f':'i':'g':':':t)] <- bs + = figure attr c s t + | Para b <- bs = if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + | LineBlock b <- bs = blockToOpenDocument o $ linesToPara b + | Div _ xs <- bs = blocksToOpenDocument o xs + | Header i _ b <- bs = setFirstPara >> + (inHeaderTags i =<< inlinesToOpenDocument o b) + | BlockQuote b <- bs = setFirstPara >> mkBlockQuote b + | DefinitionList b <- bs = setFirstPara >> defList b + | BulletList b <- bs = setFirstPara >> bulletListToOpenDocument o b + | OrderedList a b <- bs = setFirstPara >> orderedList a b + | CodeBlock _ s <- bs = setFirstPara >> preformatted s + | Table c a w h r <- bs = setFirstPara >> table c a w h r + | HorizontalRule <- bs = setFirstPara >> return (selfClosingTag "text:p" + [ ("text:style-name", "Horizontal_20_Line") ]) + | RawBlock f s <- bs = if f == Format "opendocument" + then return $ text s + else do + report $ BlockNotRendered bs + return empty + | Null <- bs = return empty + | otherwise = return empty + where + defList b = do setInDefinitionList True + r <- vcat <$> mapM (deflistItemToOpenDocument o) b + setInDefinitionList False + return r + preformatted s = (flush . vcat) <$> mapM (inPreformattedTags . escapeStringForXML) (lines s) + mkBlockQuote b = do increaseIndent + i <- paraStyle + [("style:parent-style-name","Quotations")] + inBlockQuote o i (map plainToPara b) + orderedList a b = do (ln,pn) <- newOrderedListStyle (isTightList b) a + inTags True "text:list" [ ("text:style-name", "L" ++ show ln)] + <$> orderedListToOpenDocument o pn b + table c a w h r = do + tn <- length <$> gets stTableStyles + pn <- length <$> gets stParaStyles + let genIds = map chr [65..] + name = "Table" ++ show (tn + 1) + columnIds = zip genIds w + mkColumn n = selfClosingTag "table:table-column" [("table:style-name", name ++ "." ++ [fst n])] + columns = map mkColumn columnIds + paraHStyles = paraTableStyles "Heading" pn a + paraStyles = paraTableStyles "Contents" (pn + length (newPara paraHStyles)) a + newPara = map snd . filter (not . isEmpty . snd) + addTableStyle $ tableStyle tn columnIds + mapM_ addParaStyle . newPara $ paraHStyles ++ paraStyles + captionDoc <- if null c + then return empty + else withParagraphStyle o "Table" [Para c] + th <- if all null h + then return empty + else colHeadsToOpenDocument o name (map fst paraHStyles) h + tr <- mapM (tableRowToOpenDocument o name (map fst paraStyles)) r + return $ inTags True "table:table" [ ("table:name" , name) + , ("table:style-name", name) + ] (vcat columns $$ th $$ vcat tr) $$ captionDoc + figure attr caption source title | null caption = + withParagraphStyle o "Figure" [Para [Image attr caption (source,title)]] + | otherwise = do + imageDoc <- withParagraphStyle o "FigureWithCaption" [Para [Image attr caption (source,title)]] + captionDoc <- withParagraphStyle o "FigureCaption" [Para caption] + return $ imageDoc $$ captionDoc + +colHeadsToOpenDocument :: PandocMonad m + => WriterOptions -> String -> [String] -> [[Block]] + -> OD m Doc +colHeadsToOpenDocument o tn ns hs = + inTagsIndented "table:table-header-rows" . inTagsIndented "table:table-row" . vcat <$> + mapM (tableItemToOpenDocument o tn) (zip ns hs) + +tableRowToOpenDocument :: PandocMonad m + => WriterOptions -> String -> [String] -> [[Block]] + -> OD m Doc +tableRowToOpenDocument o tn ns cs = + inTagsIndented "table:table-row" . vcat <$> + mapM (tableItemToOpenDocument o tn) (zip ns cs) + +tableItemToOpenDocument :: PandocMonad m + => WriterOptions -> String -> (String,[Block]) + -> OD m Doc +tableItemToOpenDocument o tn (n,i) = + let a = [ ("table:style-name" , tn ++ ".A1" ) + , ("office:value-type", "string" ) + ] + in inTags True "table:table-cell" a <$> + withParagraphStyle o n (map plainToPara i) + +-- | Convert a list of inline elements to OpenDocument. +inlinesToOpenDocument :: PandocMonad m => WriterOptions -> [Inline] -> OD m Doc +inlinesToOpenDocument o l = hcat <$> toChunks o l + +toChunks :: PandocMonad m => WriterOptions -> [Inline] -> OD m [Doc] +toChunks _ [] = return [] +toChunks o (x : xs) + | isChunkable x = do + contents <- (inTextStyle . hcat) =<< + mapM (inlineToOpenDocument o) (x:ys) + rest <- toChunks o zs + return (contents : rest) + | otherwise = do + contents <- inlineToOpenDocument o x + rest <- toChunks o xs + return (contents : rest) + where (ys, zs) = span isChunkable xs + +isChunkable :: Inline -> Bool +isChunkable (Str _) = True +isChunkable Space = True +isChunkable SoftBreak = True +isChunkable _ = False + +-- | Convert an inline element to OpenDocument. +inlineToOpenDocument :: PandocMonad m => WriterOptions -> Inline -> OD m Doc +inlineToOpenDocument o ils + = case ils of + Space -> return space + SoftBreak + | writerWrapText o == WrapPreserve + -> return $ preformatted "\n" + | otherwise -> return $ space + Span _ xs -> inlinesToOpenDocument o xs + LineBreak -> return $ selfClosingTag "text:line-break" [] + Str s -> return $ handleSpaces $ escapeStringForXML s + Emph l -> withTextStyle Italic $ inlinesToOpenDocument o l + Strong l -> withTextStyle Bold $ inlinesToOpenDocument o l + Strikeout l -> withTextStyle Strike $ inlinesToOpenDocument o l + Superscript l -> withTextStyle Sup $ inlinesToOpenDocument o l + Subscript l -> withTextStyle Sub $ inlinesToOpenDocument o l + SmallCaps l -> withTextStyle SmallC $ inlinesToOpenDocument o l + Quoted t l -> inQuotes t <$> inlinesToOpenDocument o l + Code _ s -> inlinedCode $ preformatted s + Math t s -> lift (texMathToInlines t s) >>= + inlinesToOpenDocument o + Cite _ l -> inlinesToOpenDocument o l + RawInline f s -> if f == Format "opendocument" + then return $ text s + else do + report $ InlineNotRendered ils + return empty + Link _ l (s,t) -> mkLink s t <$> inlinesToOpenDocument o l + Image attr _ (s,t) -> mkImg attr s t + Note l -> mkNote l + where + preformatted s = handleSpaces $ escapeStringForXML s + inlinedCode s = return $ inTags False "text:span" [("text:style-name", "Source_Text")] s + mkLink s t = inTags False "text:a" [ ("xlink:type" , "simple") + , ("xlink:href" , s ) + , ("office:name", t ) + ] . inSpanTags "Definition" + mkImg (_, _, kvs) s _ = do + id' <- gets stImageId + modify (\st -> st{ stImageId = id' + 1 }) + let getDims [] = [] + getDims (("width", w) :xs) = ("svg:width", w) : getDims xs + getDims (("height", h):xs) = ("svg:height", h) : getDims xs + getDims (x@("style:rel-width", _) :xs) = x : getDims xs + getDims (x@("style:rel-height", _):xs) = x : getDims xs + getDims (_:xs) = getDims xs + return $ inTags False "draw:frame" + (("draw:name", "img" ++ show id') : getDims kvs) $ + selfClosingTag "draw:image" [ ("xlink:href" , s ) + , ("xlink:type" , "simple") + , ("xlink:show" , "embed" ) + , ("xlink:actuate", "onLoad")] + mkNote l = do + n <- length <$> gets stNotes + let footNote t = inTags False "text:note" + [ ("text:id" , "ftn" ++ show n) + , ("text:note-class", "footnote" )] $ + inTagsSimple "text:note-citation" (text . show $ n + 1) <> + inTagsSimple "text:note-body" t + nn <- footNote <$> withParagraphStyle o "Footnote" l + addNote nn + return nn + +bulletListStyle :: PandocMonad m => Int -> OD m (Int,(Int,[Doc])) +bulletListStyle l = do + let doStyles i = inTags True "text:list-level-style-bullet" + [ ("text:level" , show (i + 1) ) + , ("text:style-name" , "Bullet_20_Symbols") + , ("style:num-suffix", "." ) + , ("text:bullet-char", [bulletList !! i] ) + ] (listLevelStyle (1 + i)) + bulletList = map chr $ cycle [8226,8227,8259] + listElStyle = map doStyles [0..9] + pn <- paraListStyle l + return (pn, (l, listElStyle)) + +orderedListLevelStyle :: ListAttributes -> (Int, [Doc]) -> (Int,[Doc]) +orderedListLevelStyle (s,n, d) (l,ls) = + let suffix = case d of + OneParen -> [("style:num-suffix", ")")] + TwoParens -> [("style:num-prefix", "(") + ,("style:num-suffix", ")")] + _ -> [("style:num-suffix", ".")] + format = case n of + UpperAlpha -> "A" + LowerAlpha -> "a" + UpperRoman -> "I" + LowerRoman -> "i" + _ -> "1" + listStyle = inTags True "text:list-level-style-number" + ([ ("text:level" , show $ 1 + length ls ) + , ("text:style-name" , "Numbering_20_Symbols") + , ("style:num-format", format ) + , ("text:start-value", show s ) + ] ++ suffix) (listLevelStyle (1 + length ls)) + in (l, ls ++ [listStyle]) + +listLevelStyle :: Int -> Doc +listLevelStyle i = + let indent = show (0.25 * fromIntegral i :: Double) in + selfClosingTag "style:list-level-properties" + [ ("text:space-before" , indent ++ "in") + , ("text:min-label-width", "0.25in")] + +tableStyle :: Int -> [(Char,Double)] -> Doc +tableStyle num wcs = + let tableId = "Table" ++ show (num + 1) + table = inTags True "style:style" + [("style:name", tableId) + ,("style:family", "table")] $ + selfClosingTag "style:table-properties" + [("table:align" , "center")] + colStyle (c,0) = selfClosingTag "style:style" + [ ("style:name" , tableId ++ "." ++ [c]) + , ("style:family", "table-column" )] + colStyle (c,w) = inTags True "style:style" + [ ("style:name" , tableId ++ "." ++ [c]) + , ("style:family", "table-column" )] $ + selfClosingTag "style:table-column-properties" + [("style:rel-column-width", printf "%d*" $ (floor $ w * 65535 :: Integer))] + cellStyle = inTags True "style:style" + [ ("style:name" , tableId ++ ".A1") + , ("style:family", "table-cell" )] $ + selfClosingTag "style:table-cell-properties" + [ ("fo:border", "none")] + columnStyles = map colStyle wcs + in table $$ vcat columnStyles $$ cellStyle + +paraStyle :: PandocMonad m => [(String,String)] -> OD m Int +paraStyle attrs = do + pn <- (+) 1 . length <$> gets stParaStyles + i <- (*) (0.5 :: Double) . fromIntegral <$> gets stIndentPara + b <- gets stInDefinition + t <- gets stTight + let styleAttr = [ ("style:name" , "P" ++ show pn) + , ("style:family" , "paragraph" )] + indentVal = flip (++) "in" . show $ if b then (max 0.5 i) else i + tight = if t then [ ("fo:margin-top" , "0in" ) + , ("fo:margin-bottom" , "0in" )] + else [] + indent = if (i /= 0 || b) + then [ ("fo:margin-left" , indentVal) + , ("fo:margin-right" , "0in" ) + , ("fo:text-indent" , "0in" ) + , ("style:auto-text-indent" , "false" )] + else [] + attributes = indent ++ tight + paraProps = when (not $ null attributes) $ + selfClosingTag "style:paragraph-properties" attributes + addParaStyle $ inTags True "style:style" (styleAttr ++ attrs) paraProps + return pn + +paraListStyle :: PandocMonad m => Int -> OD m Int +paraListStyle l = paraStyle + [("style:parent-style-name","Text_20_body") + ,("style:list-style-name", "L" ++ show l )] + +paraTableStyles :: String -> Int -> [Alignment] -> [(String, Doc)] +paraTableStyles _ _ [] = [] +paraTableStyles t s (a:xs) + | AlignRight <- a = ( pName s, res s "end" ) : paraTableStyles t (s + 1) xs + | AlignCenter <- a = ( pName s, res s "center") : paraTableStyles t (s + 1) xs + | otherwise = ("Table_20_" ++ t, empty ) : paraTableStyles t s xs + where pName sn = "P" ++ show (sn + 1) + res sn x = inTags True "style:style" + [ ("style:name" , pName sn ) + , ("style:family" , "paragraph" ) + , ("style:parent-style-name", "Table_20_" ++ t)] $ + selfClosingTag "style:paragraph-properties" + [ ("fo:text-align", x) + , ("style:justify-single-word", "false")] + +data TextStyle = Italic | Bold | Strike | Sub | Sup | SmallC | Pre + deriving ( Eq,Ord ) + +textStyleAttr :: TextStyle -> [(String,String)] +textStyleAttr s + | Italic <- s = [("fo:font-style" ,"italic" ) + ,("style:font-style-asian" ,"italic" ) + ,("style:font-style-complex" ,"italic" )] + | Bold <- s = [("fo:font-weight" ,"bold" ) + ,("style:font-weight-asian" ,"bold" ) + ,("style:font-weight-complex" ,"bold" )] + | Strike <- s = [("style:text-line-through-style", "solid" )] + | Sub <- s = [("style:text-position" ,"sub 58%" )] + | Sup <- s = [("style:text-position" ,"super 58%" )] + | SmallC <- s = [("fo:font-variant" ,"small-caps")] + | Pre <- s = [("style:font-name" ,"Courier New") + ,("style:font-name-asian" ,"Courier New") + ,("style:font-name-complex" ,"Courier New")] + | otherwise = [] diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs new file mode 100644 index 000000000..55d3fe656 --- /dev/null +++ b/src/Text/Pandoc/Writers/Org.hs @@ -0,0 +1,411 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2010-2015 Puneeth Chaganti <punchagan@gmail.com> + Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>, + and John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Org + Copyright : Copyright (C) 2010-2015 Puneeth Chaganti and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Puneeth Chaganti <punchagan@gmail.com> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Emacs Org-Mode. + +Org-Mode: <http://orgmode.org> +-} +module Text.Pandoc.Writers.Org ( writeOrg) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Pretty +import Text.Pandoc.Templates (renderTemplate') +import Data.Char ( isAlphaNum, toLower ) +import Data.List ( isPrefixOf, intersect, intersperse, partition, transpose ) +import Control.Monad.State +import Text.Pandoc.Class (PandocMonad) + +data WriterState = + WriterState { stNotes :: [[Block]] + , stHasMath :: Bool + , stOptions :: WriterOptions + } + +-- | Convert Pandoc to Org. +writeOrg :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeOrg opts document = return $ + let st = WriterState { stNotes = [], + stHasMath = False, + stOptions = opts } + in evalState (pandocToOrg document) st + +-- | Return Org representation of document. +pandocToOrg :: Pandoc -> State WriterState String +pandocToOrg (Pandoc meta blocks) = do + opts <- liftM stOptions get + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToOrg) + (fmap (render colwidth) . inlineListToOrg) + meta + body <- blockListToOrg blocks + notes <- liftM (reverse . stNotes) get >>= notesToOrg + hasMath <- liftM stHasMath get + let main = render colwidth $ foldl ($+$) empty $ [body, notes] + let context = defField "body" main + $ defField "math" hasMath + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +-- | Return Org representation of notes. +notesToOrg :: [[Block]] -> State WriterState Doc +notesToOrg notes = + mapM (\(num, note) -> noteToOrg num note) (zip [1..] notes) >>= + return . vsep + +-- | Return Org representation of a note. +noteToOrg :: Int -> [Block] -> State WriterState Doc +noteToOrg num note = do + contents <- blockListToOrg note + let marker = "[fn:" ++ show num ++ "] " + return $ hang (length marker) (text marker) contents + +-- | Escape special characters for Org. +escapeString :: String -> String +escapeString = escapeStringUsing $ + [ ('\x2014',"---") + , ('\x2013',"--") + , ('\x2019',"'") + , ('\x2026',"...") + ] ++ backslashEscapes "^_" + +isRawFormat :: Format -> Bool +isRawFormat f = + f == Format "latex" || f == Format "tex" || f == Format "org" + +-- | Convert Pandoc block element to Org. +blockToOrg :: Block -- ^ Block element + -> State WriterState Doc +blockToOrg Null = return empty +blockToOrg (Div (_,classes@(cls:_),kvs) bs) | "drawer" `elem` classes = do + contents <- blockListToOrg bs + let drawerNameTag = ":" <> text cls <> ":" + let keys = vcat $ map (\(k,v) -> + ":" <> text k <> ":" + <> space <> text v) kvs + let drawerEndTag = text ":END:" + return $ drawerNameTag $$ cr $$ keys $$ + blankline $$ contents $$ + blankline $$ drawerEndTag $$ + blankline +blockToOrg (Div attrs bs) = do + contents <- blockListToOrg bs + let isGreaterBlockClass = (`elem` ["center", "quote"]) . map toLower + return $ case attrs of + ("", [], []) -> + -- nullAttr, treat contents as if it wasn't wrapped + blankline $$ contents $$ blankline + (ident, [], []) -> + -- only an id: add id as an anchor, unwrap the rest + blankline $$ "<<" <> text ident <> ">>" $$ contents $$ blankline + (ident, classes, kv) -> + -- if one class looks like the name of a greater block then output as + -- such: The ID, if present, is added via the #+NAME keyword; other + -- classes and key-value pairs are kept as #+ATTR_HTML attributes. + let + (blockTypeCand, classes') = partition isGreaterBlockClass classes + in case blockTypeCand of + (blockType:classes'') -> + blankline $$ attrHtml (ident, classes'' <> classes', kv) $$ + "#+BEGIN_" <> text blockType $$ contents $$ + "#+END_" <> text blockType $$ blankline + _ -> + -- fallback: wrap in div tags + let + startTag = tagWithAttrs "div" attrs + endTag = text "</div>" + in blankline $$ "#+BEGIN_HTML" $$ + nest 2 startTag $$ "#+END_HTML" $$ blankline $$ + contents $$ blankline $$ "#+BEGIN_HTML" $$ + nest 2 endTag $$ "#+END_HTML" $$ blankline +blockToOrg (Plain inlines) = inlineListToOrg inlines +-- title beginning with fig: indicates that the image is a figure +blockToOrg (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return empty + else ("#+CAPTION: " <>) `fmap` inlineListToOrg txt + img <- inlineToOrg (Image attr txt (src,tit)) + return $ capt $$ img $$ blankline +blockToOrg (Para inlines) = do + contents <- inlineListToOrg inlines + return $ contents <> blankline +blockToOrg (LineBlock lns) = do + let splitStanza [] = [] + splitStanza xs = case break (== mempty) xs of + (l, []) -> l : [] + (l, _:r) -> l : splitStanza r + let joinWithLinefeeds = nowrap . mconcat . intersperse cr + let joinWithBlankLines = mconcat . intersperse blankline + let prettyfyStanza ls = joinWithLinefeeds <$> mapM inlineListToOrg ls + contents <- joinWithBlankLines <$> mapM prettyfyStanza (splitStanza lns) + return $ blankline $$ "#+BEGIN_VERSE" $$ + nest 2 contents $$ "#+END_VERSE" <> blankline +blockToOrg (RawBlock "html" str) = + return $ blankline $$ "#+BEGIN_HTML" $$ + nest 2 (text str) $$ "#+END_HTML" $$ blankline +blockToOrg (RawBlock f str) | isRawFormat f = + return $ text str +blockToOrg (RawBlock _ _) = return empty +blockToOrg HorizontalRule = return $ blankline $$ "--------------" $$ blankline +blockToOrg (Header level attr inlines) = do + contents <- inlineListToOrg inlines + let headerStr = text $ if level > 999 then " " else replicate level '*' + let drawerStr = if attr == nullAttr + then empty + else cr <> nest (level + 1) (propertiesDrawer attr) + return $ headerStr <> " " <> contents <> drawerStr <> blankline +blockToOrg (CodeBlock (_,classes,_) str) = do + opts <- stOptions <$> get + let tabstop = writerTabStop opts + let at = map pandocLangToOrg classes `intersect` orgLangIdentifiers + let (beg, end) = case at of + [] -> ("#+BEGIN_EXAMPLE", "#+END_EXAMPLE") + (x:_) -> ("#+BEGIN_SRC " ++ x, "#+END_SRC") + return $ text beg $$ nest tabstop (text str) $$ text end $$ blankline +blockToOrg (BlockQuote blocks) = do + contents <- blockListToOrg blocks + return $ blankline $$ "#+BEGIN_QUOTE" $$ + nest 2 contents $$ "#+END_QUOTE" $$ blankline +blockToOrg (Table caption' _ _ headers rows) = do + caption'' <- inlineListToOrg caption' + let caption = if null caption' + then empty + else ("#+CAPTION: " <> caption'') + headers' <- mapM blockListToOrg headers + rawRows <- mapM (mapM blockListToOrg) rows + let numChars = maximum . map offset + -- FIXME: width is not being used. + let widthsInChars = + map ((+2) . numChars) $ transpose (headers' : rawRows) + -- FIXME: Org doesn't allow blocks with height more than 1. + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = maximum (1 : map height blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + rows' <- mapM (\row -> do cols <- mapM blockListToOrg row + return $ makeRow cols) rows + let border ch = char '|' <> char ch <> + (hcat $ intersperse (char ch <> char '+' <> char ch) $ + map (\l -> text $ replicate l ch) widthsInChars) <> + char ch <> char '|' + let body = vcat rows' + let head'' = if all null headers + then empty + else head' $$ border '-' + return $ head'' $$ body $$ caption $$ blankline +blockToOrg (BulletList items) = do + contents <- mapM bulletListItemToOrg items + -- ensure that sublists have preceding blank line + return $ blankline $+$ vcat contents $$ blankline +blockToOrg (OrderedList (start, _, delim) items) = do + let delim' = case delim of + TwoParens -> OneParen + x -> x + let markers = take (length items) $ orderedListMarkers + (start, Decimal, delim') + let maxMarkerLength = maximum $ map length markers + let markers' = map (\m -> let s = maxMarkerLength - length m + in m ++ replicate s ' ') markers + contents <- mapM (\(item, num) -> orderedListItemToOrg item num) $ + zip markers' items + -- ensure that sublists have preceding blank line + return $ blankline $$ vcat contents $$ blankline +blockToOrg (DefinitionList items) = do + contents <- mapM definitionListItemToOrg items + return $ vcat contents $$ blankline + +-- | Convert bullet list item (list of blocks) to Org. +bulletListItemToOrg :: [Block] -> State WriterState Doc +bulletListItemToOrg items = do + contents <- blockListToOrg items + return $ hang 2 "- " (contents <> cr) + +-- | Convert ordered list item (a list of blocks) to Org. +orderedListItemToOrg :: String -- ^ marker for list item + -> [Block] -- ^ list item (list of blocks) + -> State WriterState Doc +orderedListItemToOrg marker items = do + contents <- blockListToOrg items + return $ hang (length marker + 1) (text marker <> space) (contents <> cr) + +-- | Convert defintion list item (label, list of blocks) to Org. +definitionListItemToOrg :: ([Inline], [[Block]]) -> State WriterState Doc +definitionListItemToOrg (label, defs) = do + label' <- inlineListToOrg label + contents <- liftM vcat $ mapM blockListToOrg defs + return $ hang 2 "- " $ label' <> " :: " <> (contents <> cr) + +-- | Convert list of key/value pairs to Org :PROPERTIES: drawer. +propertiesDrawer :: Attr -> Doc +propertiesDrawer (ident, classes, kv) = + let + drawerStart = text ":PROPERTIES:" + drawerEnd = text ":END:" + kv' = if (classes == mempty) then kv else ("CLASS", unwords classes):kv + kv'' = if (ident == mempty) then kv' else ("CUSTOM_ID", ident):kv' + properties = vcat $ map kvToOrgProperty kv'' + in + drawerStart <> cr <> properties <> cr <> drawerEnd + where + kvToOrgProperty :: (String, String) -> Doc + kvToOrgProperty (key, value) = + text ":" <> text key <> text ": " <> text value <> cr + +attrHtml :: Attr -> Doc +attrHtml ("" , [] , []) = mempty +attrHtml (ident, classes, kvs) = + let + name = if (null ident) then mempty else "#+NAME: " <> text ident <> cr + keyword = "#+ATTR_HTML" + classKv = ("class", unwords classes) + kvStrings = map (\(k,v) -> ":" <> k <> " " <> v) (classKv:kvs) + in name <> keyword <> ": " <> text (unwords kvStrings) <> cr + +-- | Convert list of Pandoc block elements to Org. +blockListToOrg :: [Block] -- ^ List of block elements + -> State WriterState Doc +blockListToOrg blocks = mapM blockToOrg blocks >>= return . vcat + +-- | Convert list of Pandoc inline elements to Org. +inlineListToOrg :: [Inline] -> State WriterState Doc +inlineListToOrg lst = mapM inlineToOrg lst >>= return . hcat + +-- | Convert Pandoc inline element to Org. +inlineToOrg :: Inline -> State WriterState Doc +inlineToOrg (Span (uid, [], []) []) = + return $ "<<" <> text uid <> ">>" +inlineToOrg (Span _ lst) = + inlineListToOrg lst +inlineToOrg (Emph lst) = do + contents <- inlineListToOrg lst + return $ "/" <> contents <> "/" +inlineToOrg (Strong lst) = do + contents <- inlineListToOrg lst + return $ "*" <> contents <> "*" +inlineToOrg (Strikeout lst) = do + contents <- inlineListToOrg lst + return $ "+" <> contents <> "+" +inlineToOrg (Superscript lst) = do + contents <- inlineListToOrg lst + return $ "^{" <> contents <> "}" +inlineToOrg (Subscript lst) = do + contents <- inlineListToOrg lst + return $ "_{" <> contents <> "}" +inlineToOrg (SmallCaps lst) = inlineListToOrg lst +inlineToOrg (Quoted SingleQuote lst) = do + contents <- inlineListToOrg lst + return $ "'" <> contents <> "'" +inlineToOrg (Quoted DoubleQuote lst) = do + contents <- inlineListToOrg lst + return $ "\"" <> contents <> "\"" +inlineToOrg (Cite _ lst) = inlineListToOrg lst +inlineToOrg (Code _ str) = return $ "=" <> text str <> "=" +inlineToOrg (Str str) = return $ text $ escapeString str +inlineToOrg (Math t str) = do + modify $ \st -> st{ stHasMath = True } + return $ if t == InlineMath + then "$" <> text str <> "$" + else "$$" <> text str <> "$$" +inlineToOrg (RawInline f@(Format f') str) = + return $ if isRawFormat f + then text str + else "@@" <> text f' <> ":" <> text str <> "@@" +inlineToOrg LineBreak = return (text "\\\\" <> cr) +inlineToOrg Space = return space +inlineToOrg SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + case wrapText of + WrapPreserve -> return cr + WrapAuto -> return space + WrapNone -> return space +inlineToOrg (Link _ txt (src, _)) = do + case txt of + [Str x] | escapeURI x == src -> -- autolink + do return $ "[[" <> text (orgPath x) <> "]]" + _ -> do contents <- inlineListToOrg txt + return $ "[[" <> text (orgPath src) <> "][" <> contents <> "]]" +inlineToOrg (Image _ _ (source, _)) = do + return $ "[[" <> text (orgPath source) <> "]]" +inlineToOrg (Note contents) = do + -- add to notes in state + notes <- get >>= (return . stNotes) + modify $ \st -> st { stNotes = contents:notes } + let ref = show $ (length notes) + 1 + return $ "[fn:" <> text ref <> "]" + +orgPath :: String -> String +orgPath src = + case src of + [] -> mempty -- wiki link + ('#':_) -> src -- internal link + _ | isUrl src -> src + _ | isFilePath src -> src + _ -> "file:" <> src + where + isFilePath :: String -> Bool + isFilePath cs = any (`isPrefixOf` cs) ["/", "./", "../", "file:"] + + isUrl :: String -> Bool + isUrl cs = + let (scheme, path) = break (== ':') cs + in all (\c -> isAlphaNum c || c `elem` (".-"::String)) scheme + && not (null path) + +-- | Translate from pandoc's programming language identifiers to those used by +-- org-mode. +pandocLangToOrg :: String -> String +pandocLangToOrg cs = + case cs of + "c" -> "C" + "cpp" -> "C++" + "commonlisp" -> "lisp" + "r" -> "R" + "bash" -> "sh" + _ -> cs + +-- | List of language identifiers recognized by org-mode. +orgLangIdentifiers :: [String] +orgLangIdentifiers = + [ "asymptote", "awk", "C", "C++", "clojure", "css", "d", "ditaa", "dot" + , "calc", "emacs-lisp", "fortran", "gnuplot", "haskell", "java", "js" + , "latex", "ledger", "lisp", "lilypond", "matlab", "mscgen", "ocaml" + , "octave", "org", "oz", "perl", "plantuml", "processing", "python", "R" + , "ruby", "sass", "scheme", "screen", "sed", "sh", "sql", "sqlite" + ] diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs new file mode 100644 index 000000000..5cce64d17 --- /dev/null +++ b/src/Text/Pandoc/Writers/RST.hs @@ -0,0 +1,556 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.RST + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to reStructuredText. + +reStructuredText: <http://docutils.sourceforge.net/rst.html> +-} +module Text.Pandoc.Writers.RST ( writeRST ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Builder (deleteMeta) +import Data.Maybe (fromMaybe) +import Data.List ( isPrefixOf, stripPrefix, intersperse, transpose ) +import Network.URI (isURI) +import Text.Pandoc.Pretty +import Control.Monad.State +import Data.Char (isSpace, toLower) +import Text.Pandoc.Class (PandocMonad) + +type Refs = [([Inline], Target)] + +data WriterState = + WriterState { stNotes :: [[Block]] + , stLinks :: Refs + , stImages :: [([Inline], (Attr, String, String, Maybe String))] + , stHasMath :: Bool + , stHasRawTeX :: Bool + , stOptions :: WriterOptions + , stTopLevel :: Bool + } + +-- | Convert Pandoc to RST. +writeRST :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeRST opts document = return $ + let st = WriterState { stNotes = [], stLinks = [], + stImages = [], stHasMath = False, + stHasRawTeX = False, stOptions = opts, + stTopLevel = True} + in evalState (pandocToRST document) st + +-- | Return RST representation of document. +pandocToRST :: Pandoc -> State WriterState String +pandocToRST (Pandoc meta blocks) = do + opts <- liftM stOptions get + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + let subtit = case lookupMeta "subtitle" meta of + Just (MetaBlocks [Plain xs]) -> xs + _ -> [] + title <- titleToRST (docTitle meta) subtit + metadata <- metaToJSON opts + (fmap (render colwidth) . blockListToRST) + (fmap (trimr . render colwidth) . inlineListToRST) + $ deleteMeta "title" $ deleteMeta "subtitle" meta + body <- blockListToRST' True $ case writerTemplate opts of + Just _ -> normalizeHeadings 1 blocks + Nothing -> blocks + notes <- liftM (reverse . stNotes) get >>= notesToRST + -- note that the notes may contain refs, so we do them first + refs <- liftM (reverse . stLinks) get >>= refsToRST + pics <- liftM (reverse . stImages) get >>= pictRefsToRST + hasMath <- liftM stHasMath get + rawTeX <- liftM stHasRawTeX get + let main = render colwidth $ foldl ($+$) empty $ [body, notes, refs, pics] + let context = defField "body" main + $ defField "toc" (writerTableOfContents opts) + $ defField "toc-depth" (show $ writerTOCDepth opts) + $ defField "math" hasMath + $ defField "title" (render Nothing title :: String) + $ defField "math" hasMath + $ defField "rawtex" rawTeX + $ metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + where + normalizeHeadings lev (Header l a i:bs) = + Header lev a i:normalizeHeadings (lev+1) cont ++ normalizeHeadings lev bs' + where (cont,bs') = break (headerLtEq l) bs + headerLtEq level (Header l' _ _) = l' <= level + headerLtEq _ _ = False + normalizeHeadings lev (b:bs) = b:normalizeHeadings lev bs + normalizeHeadings _ [] = [] + +-- | Return RST representation of reference key table. +refsToRST :: Refs -> State WriterState Doc +refsToRST refs = mapM keyToRST refs >>= return . vcat + +-- | Return RST representation of a reference key. +keyToRST :: ([Inline], (String, String)) + -> State WriterState Doc +keyToRST (label, (src, _)) = do + label' <- inlineListToRST label + let label'' = if ':' `elem` ((render Nothing label') :: String) + then char '`' <> label' <> char '`' + else label' + return $ nowrap $ ".. _" <> label'' <> ": " <> text src + +-- | Return RST representation of notes. +notesToRST :: [[Block]] -> State WriterState Doc +notesToRST notes = + mapM (\(num, note) -> noteToRST num note) (zip [1..] notes) >>= + return . vsep + +-- | Return RST representation of a note. +noteToRST :: Int -> [Block] -> State WriterState Doc +noteToRST num note = do + contents <- blockListToRST note + let marker = ".. [" <> text (show num) <> "]" + return $ nowrap $ marker $$ nest 3 contents + +-- | Return RST representation of picture reference table. +pictRefsToRST :: [([Inline], (Attr, String, String, Maybe String))] + -> State WriterState Doc +pictRefsToRST refs = mapM pictToRST refs >>= return . vcat + +-- | Return RST representation of a picture substitution reference. +pictToRST :: ([Inline], (Attr, String, String, Maybe String)) + -> State WriterState Doc +pictToRST (label, (attr, src, _, mbtarget)) = do + label' <- inlineListToRST label + dims <- imageDimsToRST attr + let (_, cls, _) = attr + classes = if null cls + then empty + else ":class: " <> text (unwords cls) + return $ nowrap + $ ".. |" <> label' <> "| image:: " <> text src $$ hang 3 empty (classes $$ dims) + $$ case mbtarget of + Nothing -> empty + Just t -> " :target: " <> text t + +-- | Escape special characters for RST. +escapeString :: WriterOptions -> String -> String +escapeString _ [] = [] +escapeString opts (c:cs) = + case c of + _ | c `elem` ['\\','`','*','_','|'] -> '\\':c:escapeString opts cs + '\'' | isEnabled Ext_smart opts -> '\\':'\'':escapeString opts cs + '"' | isEnabled Ext_smart opts -> '\\':'"':escapeString opts cs + '-' | isEnabled Ext_smart opts -> + case cs of + '-':_ -> '\\':'-':escapeString opts cs + _ -> '-':escapeString opts cs + '.' | isEnabled Ext_smart opts -> + case cs of + '.':'.':rest -> '\\':'.':'.':'.':escapeString opts rest + _ -> '.':escapeString opts cs + _ -> c : escapeString opts cs + +titleToRST :: [Inline] -> [Inline] -> State WriterState Doc +titleToRST [] _ = return empty +titleToRST tit subtit = do + title <- inlineListToRST tit + subtitle <- inlineListToRST subtit + return $ bordered title '=' $$ bordered subtitle '-' + +bordered :: Doc -> Char -> Doc +bordered contents c = + if len > 0 + then border $$ contents $$ border + else empty + where len = offset contents + border = text (replicate len c) + +-- | Convert Pandoc block element to RST. +blockToRST :: Block -- ^ Block element + -> State WriterState Doc +blockToRST Null = return empty +blockToRST (Div attr bs) = do + contents <- blockListToRST bs + let startTag = ".. raw:: html" $+$ nest 3 (tagWithAttrs "div" attr) + let endTag = ".. raw:: html" $+$ nest 3 "</div>" + return $ blankline <> startTag $+$ contents $+$ endTag $$ blankline +blockToRST (Plain inlines) = inlineListToRST inlines +-- title beginning with fig: indicates that the image is a figure +blockToRST (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- inlineListToRST txt + dims <- imageDimsToRST attr + let fig = "figure:: " <> text src + alt = ":alt: " <> if null tit then capt else text tit + (_,cls,_) = attr + classes = if null cls + then empty + else ":figclass: " <> text (unwords cls) + return $ hang 3 ".. " (fig $$ alt $$ classes $$ dims $+$ capt) $$ blankline +blockToRST (Para inlines) + | LineBreak `elem` inlines = do -- use line block if LineBreaks + linesToLineBlock $ splitBy (==LineBreak) inlines + | otherwise = do + contents <- inlineListToRST inlines + return $ contents <> blankline +blockToRST (LineBlock lns) = + linesToLineBlock lns +blockToRST (RawBlock f@(Format f') str) + | f == "rst" = return $ text str + | otherwise = return $ blankline <> ".. raw:: " <> + text (map toLower f') $+$ + (nest 3 $ text str) $$ blankline +blockToRST HorizontalRule = + return $ blankline $$ "--------------" $$ blankline +blockToRST (Header level (name,classes,_) inlines) = do + contents <- inlineListToRST inlines + isTopLevel <- gets stTopLevel + if isTopLevel + then do + let headerChar = if level > 5 then ' ' else "=-~^'" !! (level - 1) + let border = text $ replicate (offset contents) headerChar + return $ nowrap $ contents $$ border $$ blankline + else do + let rub = "rubric:: " <> contents + let name' | null name = empty + | otherwise = ":name: " <> text name + let cls | null classes = empty + | otherwise = ":class: " <> text (unwords classes) + return $ nowrap $ hang 3 ".. " (rub $$ name' $$ cls) $$ blankline +blockToRST (CodeBlock (_,classes,kvs) str) = do + opts <- stOptions <$> get + let tabstop = writerTabStop opts + let startnum = maybe "" (\x -> " " <> text x) $ lookup "startFrom" kvs + let numberlines = if "numberLines" `elem` classes + then " :number-lines:" <> startnum + else empty + if "haskell" `elem` classes && "literate" `elem` classes && + isEnabled Ext_literate_haskell opts + then return $ prefixed "> " (text str) $$ blankline + else return $ + (case [c | c <- classes, + c `notElem` ["sourceCode","literate","numberLines"]] of + [] -> "::" + (lang:_) -> (".. code:: " <> text lang) $$ numberlines) + $+$ nest tabstop (text str) $$ blankline +blockToRST (BlockQuote blocks) = do + tabstop <- get >>= (return . writerTabStop . stOptions) + contents <- blockListToRST blocks + return $ (nest tabstop contents) <> blankline +blockToRST (Table caption _ widths headers rows) = do + caption' <- inlineListToRST caption + headers' <- mapM blockListToRST headers + rawRows <- mapM (mapM blockListToRST) rows + -- let isSimpleCell [Plain _] = True + -- isSimpleCell [Para _] = True + -- isSimpleCell [] = True + -- isSimpleCell _ = False + -- let isSimple = all (==0) widths && all (all isSimpleCell) rows + let numChars = maximum . map offset + opts <- get >>= return . stOptions + let widthsInChars = + if all (== 0) widths + then map ((+2) . numChars) $ transpose (headers' : rawRows) + else map (floor . (fromIntegral (writerColumns opts) *)) widths + let hpipeBlocks blocks = hcat [beg, middle, end] + where h = height (hcat blocks) + sep' = lblock 3 $ vcat (map text $ replicate h " | ") + beg = lblock 2 $ vcat (map text $ replicate h "| ") + end = lblock 2 $ vcat (map text $ replicate h " |") + middle = hcat $ intersperse sep' blocks + let makeRow = hpipeBlocks . zipWith lblock widthsInChars + let head' = makeRow headers' + let rows' = map makeRow rawRows + let border ch = char '+' <> char ch <> + (hcat $ intersperse (char ch <> char '+' <> char ch) $ + map (\l -> text $ replicate l ch) widthsInChars) <> + char ch <> char '+' + let body = vcat $ intersperse (border '-') rows' + let head'' = if all null headers + then empty + else head' $$ border '=' + let tbl = border '-' $$ head'' $$ body $$ border '-' + return $ if null caption + then tbl $$ blankline + else (".. table:: " <> caption') $$ blankline $$ nest 3 tbl $$ + blankline +blockToRST (BulletList items) = do + contents <- mapM bulletListItemToRST items + -- ensure that sublists have preceding blank line + return $ blankline $$ chomp (vcat contents) $$ blankline +blockToRST (OrderedList (start, style', delim) items) = do + let markers = if start == 1 && style' == DefaultStyle && delim == DefaultDelim + then take (length items) $ repeat "#." + else take (length items) $ orderedListMarkers + (start, style', delim) + let maxMarkerLength = maximum $ map length markers + let markers' = map (\m -> let s = maxMarkerLength - length m + in m ++ replicate s ' ') markers + contents <- mapM (\(item, num) -> orderedListItemToRST item num) $ + zip markers' items + -- ensure that sublists have preceding blank line + return $ blankline $$ chomp (vcat contents) $$ blankline +blockToRST (DefinitionList items) = do + contents <- mapM definitionListItemToRST items + -- ensure that sublists have preceding blank line + return $ blankline $$ chomp (vcat contents) $$ blankline + +-- | Convert bullet list item (list of blocks) to RST. +bulletListItemToRST :: [Block] -> State WriterState Doc +bulletListItemToRST items = do + contents <- blockListToRST items + return $ hang 3 "- " $ contents <> cr + +-- | Convert ordered list item (a list of blocks) to RST. +orderedListItemToRST :: String -- ^ marker for list item + -> [Block] -- ^ list item (list of blocks) + -> State WriterState Doc +orderedListItemToRST marker items = do + contents <- blockListToRST items + let marker' = marker ++ " " + return $ hang (length marker') (text marker') $ contents <> cr + +-- | Convert defintion list item (label, list of blocks) to RST. +definitionListItemToRST :: ([Inline], [[Block]]) -> State WriterState Doc +definitionListItemToRST (label, defs) = do + label' <- inlineListToRST label + contents <- liftM vcat $ mapM blockListToRST defs + tabstop <- get >>= (return . writerTabStop . stOptions) + return $ label' $$ nest tabstop (nestle contents <> cr) + +-- | Format a list of lines as line block. +linesToLineBlock :: [[Inline]] -> State WriterState Doc +linesToLineBlock inlineLines = do + lns <- mapM inlineListToRST inlineLines + return $ (vcat $ map (hang 2 (text "| ")) lns) <> blankline + +-- | Convert list of Pandoc block elements to RST. +blockListToRST' :: Bool + -> [Block] -- ^ List of block elements + -> State WriterState Doc +blockListToRST' topLevel blocks = do + tl <- gets stTopLevel + modify (\s->s{stTopLevel=topLevel}) + res <- vcat `fmap` mapM blockToRST blocks + modify (\s->s{stTopLevel=tl}) + return res + +blockListToRST :: [Block] -- ^ List of block elements + -> State WriterState Doc +blockListToRST = blockListToRST' False + +-- | Convert list of Pandoc inline elements to RST. +inlineListToRST :: [Inline] -> State WriterState Doc +inlineListToRST lst = + mapM inlineToRST (removeSpaceAfterDisplayMath $ insertBS lst) >>= + return . hcat + where -- remove spaces after displaymath, as they screw up indentation: + removeSpaceAfterDisplayMath (Math DisplayMath x : zs) = + Math DisplayMath x : dropWhile (==Space) zs + removeSpaceAfterDisplayMath (x:xs) = x : removeSpaceAfterDisplayMath xs + removeSpaceAfterDisplayMath [] = [] + insertBS :: [Inline] -> [Inline] -- insert '\ ' where needed + insertBS (x:y:z:zs) + | isComplex y && (surroundComplex x z) = + x : y : insertBS (z : zs) + insertBS (x:y:zs) + | isComplex x && not (okAfterComplex y) = + x : RawInline "rst" "\\ " : insertBS (y : zs) + | isComplex y && not (okBeforeComplex x) = + x : RawInline "rst" "\\ " : insertBS (y : zs) + | otherwise = + x : insertBS (y : zs) + insertBS (x:ys) = x : insertBS ys + insertBS [] = [] + surroundComplex :: Inline -> Inline -> Bool + surroundComplex (Str s@(_:_)) (Str s'@(_:_)) = + case (last s, head s') of + ('\'','\'') -> True + ('"','"') -> True + ('<','>') -> True + ('[',']') -> True + ('{','}') -> True + _ -> False + surroundComplex _ _ = False + okAfterComplex :: Inline -> Bool + okAfterComplex Space = True + okAfterComplex SoftBreak = True + okAfterComplex LineBreak = True + okAfterComplex (Str (c:_)) = isSpace c || c `elem` ("-.,:;!?\\/'\")]}>–—" :: String) + okAfterComplex _ = False + okBeforeComplex :: Inline -> Bool + okBeforeComplex Space = True + okBeforeComplex SoftBreak = True + okBeforeComplex LineBreak = True + okBeforeComplex (Str (c:_)) = isSpace c || c `elem` ("-:/'\"<([{–—" :: String) + okBeforeComplex _ = False + isComplex :: Inline -> Bool + isComplex (Emph _) = True + isComplex (Strong _) = True + isComplex (SmallCaps _) = True + isComplex (Strikeout _) = True + isComplex (Superscript _) = True + isComplex (Subscript _) = True + isComplex (Link _ _ _) = True + isComplex (Image _ _ _) = True + isComplex (Code _ _) = True + isComplex (Math _ _) = True + isComplex (Cite _ (x:_)) = isComplex x + isComplex (Span _ (x:_)) = isComplex x + isComplex _ = False + +-- | Convert Pandoc inline element to RST. +inlineToRST :: Inline -> State WriterState Doc +inlineToRST (Span _ ils) = inlineListToRST ils +inlineToRST (Emph lst) = do + contents <- inlineListToRST lst + return $ "*" <> contents <> "*" +inlineToRST (Strong lst) = do + contents <- inlineListToRST lst + return $ "**" <> contents <> "**" +inlineToRST (Strikeout lst) = do + contents <- inlineListToRST lst + return $ "[STRIKEOUT:" <> contents <> "]" +inlineToRST (Superscript lst) = do + contents <- inlineListToRST lst + return $ ":sup:`" <> contents <> "`" +inlineToRST (Subscript lst) = do + contents <- inlineListToRST lst + return $ ":sub:`" <> contents <> "`" +inlineToRST (SmallCaps lst) = inlineListToRST lst +inlineToRST (Quoted SingleQuote lst) = do + contents <- inlineListToRST lst + opts <- gets stOptions + if isEnabled Ext_smart opts + then return $ "'" <> contents <> "'" + else return $ "‘" <> contents <> "’" +inlineToRST (Quoted DoubleQuote lst) = do + contents <- inlineListToRST lst + opts <- gets stOptions + if isEnabled Ext_smart opts + then return $ "\"" <> contents <> "\"" + else return $ "“" <> contents <> "”" +inlineToRST (Cite _ lst) = + inlineListToRST lst +inlineToRST (Code _ str) = return $ "``" <> text str <> "``" +inlineToRST (Str str) = do + opts <- gets stOptions + return $ text $ + (if isEnabled Ext_smart opts + then unsmartify opts + else id) $ escapeString opts str +inlineToRST (Math t str) = do + modify $ \st -> st{ stHasMath = True } + return $ if t == InlineMath + then ":math:`" <> text str <> "`" + else if '\n' `elem` str + then blankline $$ ".. math::" $$ + blankline $$ nest 3 (text str) $$ blankline + else blankline $$ (".. math:: " <> text str) $$ blankline +inlineToRST (RawInline f x) + | f == "rst" = return $ text x + | f == "latex" || f == "tex" = do + modify $ \st -> st{ stHasRawTeX = True } + return $ ":raw-latex:`" <> text x <> "`" + | otherwise = return empty +inlineToRST (LineBreak) = return cr -- there's no line break in RST (see Para) +inlineToRST Space = return space +inlineToRST SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + case wrapText of + WrapPreserve -> return cr + WrapAuto -> return space + WrapNone -> return space +-- autolink +inlineToRST (Link _ [Str str] (src, _)) + | isURI src && + if "mailto:" `isPrefixOf` src + then src == escapeURI ("mailto:" ++ str) + else src == escapeURI str = do + let srcSuffix = fromMaybe src (stripPrefix "mailto:" src) + return $ text srcSuffix +inlineToRST (Link _ [Image attr alt (imgsrc,imgtit)] (src, _tit)) = do + label <- registerImage attr alt (imgsrc,imgtit) (Just src) + return $ "|" <> label <> "|" +inlineToRST (Link _ txt (src, tit)) = do + useReferenceLinks <- get >>= return . writerReferenceLinks . stOptions + linktext <- inlineListToRST $ normalizeSpaces txt + if useReferenceLinks + then do refs <- get >>= return . stLinks + case lookup txt refs of + Just (src',tit') -> + if src == src' && tit == tit' + then return $ "`" <> linktext <> "`_" + else do -- duplicate label, use non-reference link + return $ "`" <> linktext <> " <" <> text src <> ">`__" + Nothing -> do + modify $ \st -> st { stLinks = (txt,(src,tit)):refs } + return $ "`" <> linktext <> "`_" + else return $ "`" <> linktext <> " <" <> text src <> ">`__" +inlineToRST (Image attr alternate (source, tit)) = do + label <- registerImage attr alternate (source,tit) Nothing + return $ "|" <> label <> "|" +inlineToRST (Note contents) = do + -- add to notes in state + notes <- gets stNotes + modify $ \st -> st { stNotes = contents:notes } + let ref = show $ (length notes) + 1 + return $ " [" <> text ref <> "]_" + +registerImage :: Attr -> [Inline] -> Target -> Maybe String -> State WriterState Doc +registerImage attr alt (src,tit) mbtarget = do + pics <- get >>= return . stImages + txt <- case lookup alt pics of + Just (a,s,t,mbt) | (a,s,t,mbt) == (attr,src,tit,mbtarget) + -> return alt + _ -> do + let alt' = if null alt || alt == [Str ""] + then [Str $ "image" ++ show (length pics)] + else alt + modify $ \st -> st { stImages = + (alt', (attr,src,tit, mbtarget)):stImages st } + return alt' + inlineListToRST txt + +imageDimsToRST :: Attr -> State WriterState Doc +imageDimsToRST attr = do + let (ident, _, _) = attr + name = if null ident + then empty + else ":name: " <> text ident + showDim dir = let cols d = ":" <> text (show dir) <> ": " <> text (show d) + in case (dimension dir attr) of + Just (Percent a) -> + case dir of + Height -> empty + Width -> cols (Percent a) + Just dim -> cols dim + Nothing -> empty + return $ cr <> name $$ showDim Width $$ showDim Height diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs new file mode 100644 index 000000000..ef012e58e --- /dev/null +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -0,0 +1,412 @@ +{-# LANGUAGE ScopedTypeVariables #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.RTF + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to RTF (rich text format). +-} +module Text.Pandoc.Writers.RTF ( writeRTF + ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Writers.Math +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.Walk +import Text.Pandoc.Logging +import Data.List ( isSuffixOf, intercalate ) +import Data.Char ( ord, chr, isDigit ) +import qualified Data.ByteString as B +import qualified Data.Map as M +import Text.Printf ( printf ) +import Text.Pandoc.ImageSize +import Control.Monad.Except (throwError, runExceptT, lift) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import qualified Text.Pandoc.Class as P + +-- | Convert Image inlines into a raw RTF embedded image, read from a file, +-- or a MediaBag, or the internet. +-- If file not found or filetype not jpeg or png, leave the inline unchanged. +rtfEmbedImage :: PandocMonad m => WriterOptions -> Inline -> m Inline +rtfEmbedImage opts x@(Image attr _ (src,_)) = do + result <- runExceptT $ lift $ P.fetchItem (writerSourceURL opts) src + case result of + Right (imgdata, Just mime) + | mime == "image/jpeg" || mime == "image/png" -> do + let bytes = map (printf "%02x") $ B.unpack imgdata + filetype <- case mime of + "image/jpeg" -> return "\\jpegblip" + "image/png" -> return "\\pngblip" + _ -> throwError $ PandocSomeError "Unknown file type" + sizeSpec <- case imageSize imgdata of + Left msg -> do + report $ CouldNotDetermineImageSize src msg + return "" + Right sz -> return $ "\\picw" ++ show xpx ++ + "\\pich" ++ show ypx ++ + "\\picwgoal" ++ show (floor (xpt * 20) :: Integer) + ++ "\\pichgoal" ++ show (floor (ypt * 20) :: Integer) + -- twip = 1/1440in = 1/20pt + where (xpx, ypx) = sizeInPixels sz + (xpt, ypt) = desiredSizeInPoints opts attr sz + let raw = "{\\pict" ++ filetype ++ sizeSpec ++ "\\bin " ++ + concat bytes ++ "}" + if B.null imgdata + then do + report $ CouldNotFetchResource src "image contained no data" + return x + else return $ RawInline (Format "rtf") raw + | otherwise -> do + report $ CouldNotFetchResource src "image is not a jpeg or png" + return x + Right (_, Nothing) -> do + report $ CouldNotDetermineMimeType src + return x + Left ( e :: PandocError ) -> do + report $ CouldNotFetchResource src (show e) + return x +rtfEmbedImage _ x = return x + +-- | Convert Pandoc to a string in rich text format. +writeRTF :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeRTF options doc = do + -- handle images + Pandoc meta@(Meta metamap) blocks <- walkM (rtfEmbedImage options) doc + let spacer = not $ all null $ docTitle meta : docDate meta : docAuthors meta + let toPlain (MetaBlocks [Para ils]) = MetaInlines ils + toPlain x = x + -- adjust title, author, date so we don't get para inside para + let meta' = Meta $ M.adjust toPlain "title" + . M.adjust toPlain "author" + . M.adjust toPlain "date" + $ metamap + metadata <- metaToJSON options + (fmap concat . mapM (blockToRTF 0 AlignDefault)) + (inlinesToRTF) + meta' + body <- blocksToRTF 0 AlignDefault blocks + let isTOCHeader (Header lev _ _) = lev <= writerTOCDepth options + isTOCHeader _ = False + toc <- tableOfContents $ filter isTOCHeader blocks + let context = defField "body" body + $ defField "spacer" spacer + $ (if writerTableOfContents options + then defField "toc" toc + else id) + $ metadata + return $ case writerTemplate options of + Just tpl -> renderTemplate' tpl context + Nothing -> case reverse body of + ('\n':_) -> body + _ -> body ++ "\n" + +-- | Construct table of contents from list of header blocks. +tableOfContents :: PandocMonad m => [Block] -> m String +tableOfContents headers = do + let contents = map elementToListItem $ hierarchicalize headers + blocksToRTF 0 AlignDefault $ + [Header 1 nullAttr [Str "Contents"], BulletList contents] + +elementToListItem :: Element -> [Block] +elementToListItem (Blk _) = [] +elementToListItem (Sec _ _ _ sectext subsecs) = [Plain sectext] ++ + if null subsecs + then [] + else [BulletList (map elementToListItem subsecs)] + +-- | Convert unicode characters (> 127) into rich text format representation. +handleUnicode :: String -> String +handleUnicode [] = [] +handleUnicode (c:cs) = + if ord c > 127 + then if surrogate c + then let x = ord c - 0x10000 + (q, r) = x `divMod` 0x400 + upper = q + 0xd800 + lower = r + 0xDC00 + in enc (chr upper) ++ enc (chr lower) ++ handleUnicode cs + else enc c ++ handleUnicode cs + else c:(handleUnicode cs) + where + surrogate x = not ( (0x0000 <= ord x && ord x <= 0xd7ff) + || (0xe000 <= ord x && ord x <= 0xffff) ) + enc x = '\\':'u':(show (ord x)) ++ "?" + +-- | Escape special characters. +escapeSpecial :: String -> String +escapeSpecial = escapeStringUsing $ + [ ('\t',"\\tab ") + , ('\8216',"\\u8216'") + , ('\8217',"\\u8217'") + , ('\8220',"\\u8220\"") + , ('\8221',"\\u8221\"") + , ('\8211',"\\u8211-") + , ('\8212',"\\u8212-") + ] ++ backslashEscapes "{\\}" + +-- | Escape strings as needed for rich text format. +stringToRTF :: String -> String +stringToRTF = handleUnicode . escapeSpecial + +-- | Escape things as needed for code block in RTF. +codeStringToRTF :: String -> String +codeStringToRTF str = intercalate "\\line\n" $ lines (stringToRTF str) + +-- | Make a paragraph with first-line indent, block indent, and space after. +rtfParSpaced :: Int -- ^ space after (in twips) + -> Int -- ^ block indent (in twips) + -> Int -- ^ first line indent (relative to block) (in twips) + -> Alignment -- ^ alignment + -> String -- ^ string with content + -> String +rtfParSpaced spaceAfter indent firstLineIndent alignment content = + let alignString = case alignment of + AlignLeft -> "\\ql " + AlignRight -> "\\qr " + AlignCenter -> "\\qc " + AlignDefault -> "\\ql " + in "{\\pard " ++ alignString ++ + "\\f0 \\sa" ++ (show spaceAfter) ++ " \\li" ++ (show indent) ++ + " \\fi" ++ (show firstLineIndent) ++ " " ++ content ++ "\\par}\n" + +-- | Default paragraph. +rtfPar :: Int -- ^ block indent (in twips) + -> Int -- ^ first line indent (relative to block) (in twips) + -> Alignment -- ^ alignment + -> String -- ^ string with content + -> String +rtfPar = rtfParSpaced 180 + +-- | Compact paragraph (e.g. for compact list items). +rtfCompact :: Int -- ^ block indent (in twips) + -> Int -- ^ first line indent (relative to block) (in twips) + -> Alignment -- ^ alignment + -> String -- ^ string with content + -> String +rtfCompact = rtfParSpaced 0 + +-- number of twips to indent +indentIncrement :: Int +indentIncrement = 720 + +listIncrement :: Int +listIncrement = 360 + +-- | Returns appropriate bullet list marker for indent level. +bulletMarker :: Int -> String +bulletMarker indent = case indent `mod` 720 of + 0 -> "\\bullet " + _ -> "\\endash " + +-- | Returns appropriate (list of) ordered list markers for indent level. +orderedMarkers :: Int -> ListAttributes -> [String] +orderedMarkers indent (start, style, delim) = + if style == DefaultStyle && delim == DefaultDelim + then case indent `mod` 720 of + 0 -> orderedListMarkers (start, Decimal, Period) + _ -> orderedListMarkers (start, LowerAlpha, Period) + else orderedListMarkers (start, style, delim) + +blocksToRTF :: PandocMonad m + => Int + -> Alignment + -> [Block] + -> m String +blocksToRTF indent align = fmap concat . mapM (blockToRTF indent align) + +-- | Convert Pandoc block element to RTF. +blockToRTF :: PandocMonad m + => Int -- ^ indent level + -> Alignment -- ^ alignment + -> Block -- ^ block to convert + -> m String +blockToRTF _ _ Null = return "" +blockToRTF indent alignment (Div _ bs) = + blocksToRTF indent alignment bs +blockToRTF indent alignment (Plain lst) = + rtfCompact indent 0 alignment <$> inlinesToRTF lst +blockToRTF indent alignment (Para lst) = + rtfPar indent 0 alignment <$> inlinesToRTF lst +blockToRTF indent alignment (LineBlock lns) = + blockToRTF indent alignment $ linesToPara lns +blockToRTF indent alignment (BlockQuote lst) = + blocksToRTF (indent + indentIncrement) alignment lst +blockToRTF indent _ (CodeBlock _ str) = + return $ rtfPar indent 0 AlignLeft ("\\f1 " ++ (codeStringToRTF str)) +blockToRTF _ _ b@(RawBlock f str) + | f == Format "rtf" = return str + | otherwise = do + report $ BlockNotRendered b + return "" +blockToRTF indent alignment (BulletList lst) = (spaceAtEnd . concat) <$> + mapM (listItemToRTF alignment indent (bulletMarker indent)) lst +blockToRTF indent alignment (OrderedList attribs lst) = + (spaceAtEnd . concat) <$> + mapM (\(x,y) -> listItemToRTF alignment indent x y) + (zip (orderedMarkers indent attribs) lst) +blockToRTF indent alignment (DefinitionList lst) = (spaceAtEnd . concat) <$> + mapM (definitionListItemToRTF alignment indent) lst +blockToRTF indent _ HorizontalRule = return $ + rtfPar indent 0 AlignCenter "\\emdash\\emdash\\emdash\\emdash\\emdash" +blockToRTF indent alignment (Header level _ lst) = do + contents <- inlinesToRTF lst + return $ rtfPar indent 0 alignment $ + "\\b \\fs" ++ (show (40 - (level * 4))) ++ " " ++ contents +blockToRTF indent alignment (Table caption aligns sizes headers rows) = do + caption' <- inlinesToRTF caption + header' <- if all null headers + then return "" + else tableRowToRTF True indent aligns sizes headers + rows' <- concat <$> mapM (tableRowToRTF False indent aligns sizes) rows + return $ header' ++ rows' ++ rtfPar indent 0 alignment caption' + +tableRowToRTF :: PandocMonad m + => Bool -> Int -> [Alignment] -> [Double] -> [[Block]] -> m String +tableRowToRTF header indent aligns sizes' cols = do + let totalTwips = 6 * 1440 -- 6 inches + let sizes = if all (== 0) sizes' + then take (length cols) $ repeat (1.0 / fromIntegral (length cols)) + else sizes' + columns <- concat <$> mapM (\(x,y) -> tableItemToRTF indent x y) + (zip aligns cols) + let rightEdges = tail $ scanl (\sofar new -> sofar + floor (new * totalTwips)) + (0 :: Integer) sizes + let cellDefs = map (\edge -> (if header + then "\\clbrdrb\\brdrs" + else "") ++ "\\cellx" ++ show edge) + rightEdges + let start = "{\n\\trowd \\trgaph120\n" ++ concat cellDefs ++ "\n" ++ + "\\trkeep\\intbl\n{\n" + let end = "}\n\\intbl\\row}\n" + return $ start ++ columns ++ end + +tableItemToRTF :: PandocMonad m => Int -> Alignment -> [Block] -> m String +tableItemToRTF indent alignment item = do + contents <- blocksToRTF indent alignment item + return $ "{" ++ substitute "\\pard" "\\pard\\intbl" contents ++ "\\cell}\n" + +-- | Ensure that there's the same amount of space after compact +-- lists as after regular lists. +spaceAtEnd :: String -> String +spaceAtEnd str = + if isSuffixOf "\\par}\n" str + then (take ((length str) - 6) str) ++ "\\sa180\\par}\n" + else str + +-- | Convert list item (list of blocks) to RTF. +listItemToRTF :: PandocMonad m + => Alignment -- ^ alignment + -> Int -- ^ indent level + -> String -- ^ list start marker + -> [Block] -- ^ list item (list of blocks) + -> m String +listItemToRTF alignment indent marker [] = return $ + rtfCompact (indent + listIncrement) (0 - listIncrement) alignment + (marker ++ "\\tx" ++ (show listIncrement) ++ "\\tab ") +listItemToRTF alignment indent marker list = do + (first:rest) <- mapM (blockToRTF (indent + listIncrement) alignment) list + let listMarker = "\\fi" ++ show (0 - listIncrement) ++ " " ++ marker ++ + "\\tx" ++ show listIncrement ++ "\\tab" + let insertListMarker ('\\':'f':'i':'-':d:xs) | isDigit d = + listMarker ++ dropWhile isDigit xs + insertListMarker ('\\':'f':'i':d:xs) | isDigit d = + listMarker ++ dropWhile isDigit xs + insertListMarker (x:xs) = + x : insertListMarker xs + insertListMarker [] = [] + -- insert the list marker into the (processed) first block + return $ insertListMarker first ++ concat rest + +-- | Convert definition list item (label, list of blocks) to RTF. +definitionListItemToRTF :: PandocMonad m + => Alignment -- ^ alignment + -> Int -- ^ indent level + -> ([Inline],[[Block]]) -- ^ list item (list of blocks) + -> m String +definitionListItemToRTF alignment indent (label, defs) = do + labelText <- blockToRTF indent alignment (Plain label) + itemsText <- blocksToRTF (indent + listIncrement) alignment (concat defs) + return $ labelText ++ itemsText + +-- | Convert list of inline items to RTF. +inlinesToRTF :: PandocMonad m + => [Inline] -- ^ list of inlines to convert + -> m String +inlinesToRTF lst = concat <$> mapM inlineToRTF lst + +-- | Convert inline item to RTF. +inlineToRTF :: PandocMonad m + => Inline -- ^ inline to convert + -> m String +inlineToRTF (Span _ lst) = inlinesToRTF lst +inlineToRTF (Emph lst) = do + contents <- inlinesToRTF lst + return $ "{\\i " ++ contents ++ "}" +inlineToRTF (Strong lst) = do + contents <- inlinesToRTF lst + return $ "{\\b " ++ contents ++ "}" +inlineToRTF (Strikeout lst) = do + contents <- inlinesToRTF lst + return $ "{\\strike " ++ contents ++ "}" +inlineToRTF (Superscript lst) = do + contents <- inlinesToRTF lst + return $ "{\\super " ++ contents ++ "}" +inlineToRTF (Subscript lst) = do + contents <- inlinesToRTF lst + return $ "{\\sub " ++ contents ++ "}" +inlineToRTF (SmallCaps lst) = do + contents <- inlinesToRTF lst + return $ "{\\scaps " ++ contents ++ "}" +inlineToRTF (Quoted SingleQuote lst) = do + contents <- inlinesToRTF lst + return $ "\\u8216'" ++ contents ++ "\\u8217'" +inlineToRTF (Quoted DoubleQuote lst) = do + contents <- inlinesToRTF lst + return $ "\\u8220\"" ++ contents ++ "\\u8221\"" +inlineToRTF (Code _ str) = return $ "{\\f1 " ++ (codeStringToRTF str) ++ "}" +inlineToRTF (Str str) = return $ stringToRTF str +inlineToRTF (Math t str) = texMathToInlines t str >>= inlinesToRTF +inlineToRTF (Cite _ lst) = inlinesToRTF lst +inlineToRTF il@(RawInline f str) + | f == Format "rtf" = return str + | otherwise = do + return $ InlineNotRendered il + return "" +inlineToRTF (LineBreak) = return "\\line " +inlineToRTF SoftBreak = return " " +inlineToRTF Space = return " " +inlineToRTF (Link _ text (src, _)) = do + contents <- inlinesToRTF text + return $ "{\\field{\\*\\fldinst{HYPERLINK \"" ++ (codeStringToRTF src) ++ + "\"}}{\\fldrslt{\\ul\n" ++ contents ++ "\n}}}\n" +inlineToRTF (Image _ _ (source, _)) = + return $ "{\\cf1 [image: " ++ source ++ "]\\cf0}" +inlineToRTF (Note contents) = do + body <- concat <$> mapM (blockToRTF 0 AlignDefault) contents + return $ "{\\super\\chftn}{\\*\\footnote\\chftn\\~\\plain\\pard " ++ + body ++ "}" diff --git a/src/Text/Pandoc/Writers/Shared.hs b/src/Text/Pandoc/Writers/Shared.hs new file mode 100644 index 000000000..89a826269 --- /dev/null +++ b/src/Text/Pandoc/Writers/Shared.hs @@ -0,0 +1,183 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2013-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Shared + Copyright : Copyright (C) 2013-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Shared utility functions for pandoc writers. +-} +module Text.Pandoc.Writers.Shared ( + metaToJSON + , getField + , setField + , defField + , tagWithAttrs + , fixDisplayMath + , unsmartify + ) +where +import Text.Pandoc.Definition +import Text.Pandoc.Pretty +import Text.Pandoc.Options +import Text.Pandoc.XML (escapeStringForXML) +import Control.Monad (liftM) +import qualified Data.HashMap.Strict as H +import qualified Data.Map as M +import qualified Data.Text as T +import Data.Aeson (FromJSON(..), fromJSON, ToJSON (..), Value(Object), Result(..), encode) +import Text.Pandoc.UTF8 (toStringLazy) +import qualified Data.Traversable as Traversable +import Data.List ( groupBy ) +import Data.Maybe ( isJust ) + +-- | Create JSON value for template from a 'Meta' and an association list +-- of variables, specified at the command line or in the writer. +-- Variables overwrite metadata fields with the same names. +-- If multiple variables are set with the same name, a list is +-- assigned. +metaToJSON :: Monad m + => WriterOptions + -> ([Block] -> m String) + -> ([Inline] -> m String) + -> Meta + -> m Value +metaToJSON opts blockWriter inlineWriter (Meta metamap) + | isJust (writerTemplate opts) = do + let baseContext = foldl (\acc (x,y) -> setField x y acc) (Object H.empty) + $ writerVariables opts + renderedMap <- Traversable.mapM + (metaValueToJSON blockWriter inlineWriter) + metamap + let metadata = M.foldWithKey defField baseContext renderedMap + return $ defField "meta-json" (toStringLazy $ encode metadata) metadata + | otherwise = return (Object H.empty) + +metaValueToJSON :: Monad m + => ([Block] -> m String) + -> ([Inline] -> m String) + -> MetaValue + -> m Value +metaValueToJSON blockWriter inlineWriter (MetaMap metamap) = liftM toJSON $ + Traversable.mapM (metaValueToJSON blockWriter inlineWriter) metamap +metaValueToJSON blockWriter inlineWriter (MetaList xs) = liftM toJSON $ + Traversable.mapM (metaValueToJSON blockWriter inlineWriter) xs +metaValueToJSON _ _ (MetaBool b) = return $ toJSON b +metaValueToJSON _ _ (MetaString s) = return $ toJSON s +metaValueToJSON blockWriter _ (MetaBlocks bs) = liftM toJSON $ blockWriter bs +metaValueToJSON _ inlineWriter (MetaInlines bs) = liftM toJSON $ inlineWriter bs + +-- | Retrieve a field value from a JSON object. +getField :: FromJSON a + => String + -> Value + -> Maybe a +getField field (Object hashmap) = do + result <- H.lookup (T.pack field) hashmap + case fromJSON result of + Success x -> return x + _ -> fail "Could not convert from JSON" +getField _ _ = fail "Not a JSON object" + +setField :: ToJSON a + => String + -> a + -> Value + -> Value +-- | Set a field of a JSON object. If the field already has a value, +-- convert it into a list with the new value appended to the old value(s). +-- This is a utility function to be used in preparing template contexts. +setField field val (Object hashmap) = + Object $ H.insertWith combine (T.pack field) (toJSON val) hashmap + where combine newval oldval = + case fromJSON oldval of + Success xs -> toJSON $ xs ++ [newval] + _ -> toJSON [oldval, newval] +setField _ _ x = x + +defField :: ToJSON a + => String + -> a + -> Value + -> Value +-- | Set a field of a JSON object if it currently has no value. +-- If it has a value, do nothing. +-- This is a utility function to be used in preparing template contexts. +defField field val (Object hashmap) = + Object $ H.insertWith f (T.pack field) (toJSON val) hashmap + where f _newval oldval = oldval +defField _ _ x = x + +-- Produce an HTML tag with the given pandoc attributes. +tagWithAttrs :: String -> Attr -> Doc +tagWithAttrs tag (ident,classes,kvs) = hsep + ["<" <> text tag + ,if null ident + then empty + else "id=" <> doubleQuotes (text ident) + ,if null classes + then empty + else "class=" <> doubleQuotes (text (unwords classes)) + ,hsep (map (\(k,v) -> text k <> "=" <> + doubleQuotes (text (escapeStringForXML v))) kvs) + ] <> ">" + +isDisplayMath :: Inline -> Bool +isDisplayMath (Math DisplayMath _) = True +isDisplayMath _ = False + +stripLeadingTrailingSpace :: [Inline] -> [Inline] +stripLeadingTrailingSpace = go . reverse . go . reverse + where go (Space:xs) = xs + go (SoftBreak:xs) = xs + go xs = xs + +-- Put display math in its own block (for ODT/DOCX). +fixDisplayMath :: Block -> Block +fixDisplayMath (Plain lst) + | any isDisplayMath lst && not (all isDisplayMath lst) = + -- chop into several paragraphs so each displaymath is its own + Div ("",["math"],[]) $ map (Plain . stripLeadingTrailingSpace) $ + groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || + not (isDisplayMath x || isDisplayMath y)) lst +fixDisplayMath (Para lst) + | any isDisplayMath lst && not (all isDisplayMath lst) = + -- chop into several paragraphs so each displaymath is its own + Div ("",["math"],[]) $ map (Para . stripLeadingTrailingSpace) $ + groupBy (\x y -> (isDisplayMath x && isDisplayMath y) || + not (isDisplayMath x || isDisplayMath y)) lst +fixDisplayMath x = x + +unsmartify :: WriterOptions -> String -> String +unsmartify opts ('\8217':xs) = '\'' : unsmartify opts xs +unsmartify opts ('\8230':xs) = "..." ++ unsmartify opts xs +unsmartify opts ('\8211':xs) + | isEnabled Ext_old_dashes opts = '-' : unsmartify opts xs + | otherwise = "--" ++ unsmartify opts xs +unsmartify opts ('\8212':xs) + | isEnabled Ext_old_dashes opts = "--" ++ unsmartify opts xs + | otherwise = "---" ++ unsmartify opts xs +unsmartify opts (x:xs) = x : unsmartify opts xs +unsmartify _ [] = [] + diff --git a/src/Text/Pandoc/Writers/TEI.hs b/src/Text/Pandoc/Writers/TEI.hs new file mode 100644 index 000000000..a54d42c53 --- /dev/null +++ b/src/Text/Pandoc/Writers/TEI.hs @@ -0,0 +1,324 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Docbook + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Docbook XML. +-} +module Text.Pandoc.Writers.TEI (writeTEI) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Data.List ( stripPrefix, isPrefixOf ) +import Data.Char ( toLower ) +import Text.Pandoc.Highlighting ( languages, languagesByExtension ) +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class ( PandocMonad ) + +-- | Convert list of authors to a docbook <author> section +authorToTEI :: WriterOptions -> [Inline] -> B.Inlines +authorToTEI opts name' = + let name = render Nothing $ inlinesToTEI opts name' + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + in B.rawInline "tei" $ render colwidth $ + inTagsSimple "author" (text $ escapeStringForXML name) + +-- | Convert Pandoc document to string in Docbook format. +writeTEI :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeTEI opts (Pandoc meta blocks) = return $ + let elements = hierarchicalize blocks + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + render' = render colwidth + startLvl = case writerTopLevelDivision opts of + TopLevelPart -> -1 + TopLevelChapter -> 0 + TopLevelSection -> 1 + TopLevelDefault -> 1 + auths' = map (authorToTEI opts) $ docAuthors meta + meta' = B.setMeta "author" auths' meta + Just metadata = metaToJSON opts + (Just . render colwidth . (vcat . + (map (elementToTEI opts startLvl)) . hierarchicalize)) + (Just . render colwidth . inlinesToTEI opts) + meta' + main = render' $ vcat (map (elementToTEI opts startLvl) elements) + context = defField "body" main + $ defField "mathml" (case writerHTMLMathMethod opts of + MathML -> True + _ -> False) + $ metadata + in case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate' tpl context + +-- | Convert an Element to TEI. +elementToTEI :: WriterOptions -> Int -> Element -> Doc +elementToTEI opts _ (Blk block) = blockToTEI opts block +elementToTEI opts lvl (Sec _ _num (id',_,_) title elements) = + -- TEI doesn't allow sections with no content, so insert some if needed + let elements' = if null elements + then [Blk (Para [])] + else elements + -- level numbering correspond to LaTeX internals + divType = case lvl of + n | n == -1 -> "part" + | n == 0 -> "chapter" + | n >= 1 && n <= 5 -> "level" ++ show n + | otherwise -> "section" + in inTags True "div" [("type", divType) | not (null id')] $ +-- ("id", writerIdentifierPrefix opts ++ id') | not (null id')] $ + inTagsSimple "head" (inlinesToTEI opts title) $$ + vcat (map (elementToTEI opts (lvl + 1)) elements') + +-- | Convert a list of Pandoc blocks to TEI. +blocksToTEI :: WriterOptions -> [Block] -> Doc +blocksToTEI opts = vcat . map (blockToTEI opts) + +-- | Auxiliary function to convert Plain block to Para. +plainToPara :: Block -> Block +plainToPara (Plain x) = Para x +plainToPara x = x + +-- | Convert a list of pairs of terms and definitions into a TEI +-- list with labels and items. +deflistItemsToTEI :: WriterOptions -> [([Inline],[[Block]])] -> Doc +deflistItemsToTEI opts items = + vcat $ map (\(term, defs) -> deflistItemToTEI opts term defs) items + +-- | Convert a term and a list of blocks into a TEI varlistentry. +deflistItemToTEI :: WriterOptions -> [Inline] -> [[Block]] -> Doc +deflistItemToTEI opts term defs = + let def' = concatMap (map plainToPara) defs + in inTagsIndented "label" (inlinesToTEI opts term) $$ + inTagsIndented "item" (blocksToTEI opts def') + +-- | Convert a list of lists of blocks to a list of TEI list items. +listItemsToTEI :: WriterOptions -> [[Block]] -> Doc +listItemsToTEI opts items = vcat $ map (listItemToTEI opts) items + +-- | Convert a list of blocks into a TEI list item. +listItemToTEI :: WriterOptions -> [Block] -> Doc +listItemToTEI opts item = + inTagsIndented "item" $ blocksToTEI opts $ map plainToPara item + +imageToTEI :: WriterOptions -> Attr -> String -> Doc +imageToTEI _ attr src = selfClosingTag "graphic" $ + ("url", src) : idAndRole attr ++ dims + where + dims = go Width "width" ++ go Height "depth" + go dir dstr = case (dimension dir attr) of + Just a -> [(dstr, show a)] + Nothing -> [] + +-- | Convert a Pandoc block element to TEI. +blockToTEI :: WriterOptions -> Block -> Doc +blockToTEI _ Null = empty +-- Add ids to paragraphs in divs with ids - this is needed for +-- pandoc-citeproc to get link anchors in bibliographies: +blockToTEI opts (Div (ident,_,_) [Para lst]) = + let attribs = [("id", ident) | not (null ident)] in + inTags False "p" attribs $ inlinesToTEI opts lst +blockToTEI opts (Div _ bs) = blocksToTEI opts $ map plainToPara bs +blockToTEI _ (Header _ _ _) = empty -- should not occur after hierarchicalize +-- For TEI simple, text must be within containing block element, so +-- we use plainToPara to ensure that Plain text ends up contained by +-- something. +blockToTEI opts (Plain lst) = blockToTEI opts $ Para lst +-- title beginning with fig: indicates that the image is a figure +--blockToTEI opts (Para [Image attr txt (src,'f':'i':'g':':':_)]) = +-- let alt = inlinesToTEI opts txt +-- capt = if null txt +-- then empty +-- else inTagsSimple "title" alt +-- in inTagsIndented "figure" $ +-- capt $$ +-- (inTagsIndented "mediaobject" $ +-- (inTagsIndented "imageobject" +-- (imageToTEI opts attr src)) $$ +-- inTagsSimple "textobject" (inTagsSimple "phrase" alt)) +blockToTEI opts (Para lst) = + inTags False "p" [] $ inlinesToTEI opts lst +blockToTEI opts (LineBlock lns) = + blockToTEI opts $ linesToPara lns +blockToTEI opts (BlockQuote blocks) = + inTagsIndented "quote" $ blocksToTEI opts blocks +blockToTEI _ (CodeBlock (_,classes,_) str) = + text ("<ab type='codeblock " ++ lang ++ "'>") <> cr <> + flush (text (escapeStringForXML str) <> cr <> text "</ab>") + where lang = if null langs + then "" + else escapeStringForXML (head langs) + isLang l = map toLower l `elem` map (map toLower) languages + langsFrom s = if isLang s + then [s] + else languagesByExtension . map toLower $ s + langs = concatMap langsFrom classes +blockToTEI opts (BulletList lst) = + let attribs = [("type", "unordered")] + in inTags True "list" attribs $ listItemsToTEI opts lst +blockToTEI _ (OrderedList _ []) = empty +blockToTEI opts (OrderedList (start, numstyle, _) (first:rest)) = + let attribs = case numstyle of + DefaultStyle -> [] + Decimal -> [("type", "ordered:arabic")] + Example -> [("type", "ordered:arabic")] + UpperAlpha -> [("type", "ordered:upperalpha")] + LowerAlpha -> [("type", "ordered:loweralpha")] + UpperRoman -> [("type", "ordered:upperroman")] + LowerRoman -> [("type", "ordered:lowerroman")] + items = if start == 1 + then listItemsToTEI opts (first:rest) + else (inTags True "item" [("n",show start)] + (blocksToTEI opts $ map plainToPara first)) $$ + listItemsToTEI opts rest + in inTags True "list" attribs items +blockToTEI opts (DefinitionList lst) = + let attribs = [("type", "definition")] + in inTags True "list" attribs $ deflistItemsToTEI opts lst +blockToTEI _ (RawBlock f str) + | f == "tei" = text str -- raw TEI block (should such a thing exist). +-- | f == "html" = text str -- allow html for backwards compatibility + | otherwise = empty +blockToTEI _ HorizontalRule = + selfClosingTag "milestone" [("unit","undefined"), ("type","separator"),("rendition","line")] + +-- | TEI Tables +-- TEI Simple's tables are composed of cells and rows; other +-- table info in the AST is here lossily discard. +blockToTEI opts (Table _ _ _ headers rows) = + let + headers' = tableHeadersToTEI opts headers +-- headers' = if all null headers +-- then return empty +-- else tableRowToTEI opts headers + in + inTags True "table" [] $ + vcat $ [headers'] <> map (tableRowToTEI opts) rows + +tableRowToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableRowToTEI opts cols = + inTagsIndented "row" $ vcat $ map (tableItemToTEI opts) cols + +tableHeadersToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableHeadersToTEI opts cols = + inTags True "row" [("role","label")] $ vcat $ map (tableItemToTEI opts) cols + +tableItemToTEI :: WriterOptions + -> [Block] + -> Doc +tableItemToTEI opts item = + inTags False "cell" [] $ vcat $ map (blockToTEI opts) item + +-- | Convert a list of inline elements to TEI. +inlinesToTEI :: WriterOptions -> [Inline] -> Doc +inlinesToTEI opts lst = hcat $ map (inlineToTEI opts) lst + +-- | Convert an inline element to TEI. +inlineToTEI :: WriterOptions -> Inline -> Doc +inlineToTEI _ (Str str) = text $ escapeStringForXML str +inlineToTEI opts (Emph lst) = + inTags False "hi" [("rendition","simple:italic")] $ inlinesToTEI opts lst +inlineToTEI opts (Strong lst) = + inTags False "hi" [("rendition", "simple:bold")] $ inlinesToTEI opts lst +inlineToTEI opts (Strikeout lst) = + inTags False "hi" [("rendition", "simple:strikethrough")] $ + inlinesToTEI opts lst +inlineToTEI opts (Superscript lst) = + inTags False "hi" [("rendition", "simple:superscript")] $ inlinesToTEI opts lst +inlineToTEI opts (Subscript lst) = + inTags False "hi" [("rendition", "simple:subscript")] $ inlinesToTEI opts lst +inlineToTEI opts (SmallCaps lst) = + inTags False "hi" [("rendition", "simple:smallcaps")] $ + inlinesToTEI opts lst +inlineToTEI opts (Quoted _ lst) = + inTagsSimple "quote" $ inlinesToTEI opts lst +inlineToTEI opts (Cite _ lst) = + inlinesToTEI opts lst +inlineToTEI opts (Span _ ils) = + inlinesToTEI opts ils +inlineToTEI _ (Code _ str) = + inTags False "seg" [("type","code")] $ text (escapeStringForXML str) +-- Distinguish display from inline math by wrapping the former in a "figure." +inlineToTEI _ (Math t str) = + case t of + InlineMath -> inTags False "formula" [("notation","TeX")] $ + text (str) + DisplayMath -> inTags True "figure" [("type","math")] $ + inTags False "formula" [("notation","TeX")] $ text (str) + +inlineToTEI _ (RawInline f x) | f == "tei" = text x + | otherwise = empty +inlineToTEI _ LineBreak = selfClosingTag "lb" [] +inlineToTEI _ Space = space +-- because we use \n for LineBreak, we can't do soft breaks: +inlineToTEI _ SoftBreak = space +inlineToTEI opts (Link attr txt (src, _)) + | Just email <- stripPrefix "mailto:" src = + let emailLink = text $ + escapeStringForXML $ email + in case txt of + [Str s] | escapeURI s == email -> emailLink + _ -> inlinesToTEI opts txt <+> + char '(' <> emailLink <> char ')' + | otherwise = + (if isPrefixOf "#" src + then inTags False "ref" $ ("target", drop 1 src) : idAndRole attr + else inTags False "ref" $ ("target", src) : idAndRole attr ) $ + inlinesToTEI opts txt +inlineToTEI opts (Image attr description (src, tit)) = + let titleDoc = if null tit + then empty + else inTags False "figDesc" [] (text $ escapeStringForXML tit) + imageDesc = if null description + then empty + else inTags False "head" [] (inlinesToTEI opts description) + in inTagsIndented "figure" $ imageDesc $$ + imageToTEI opts attr src $$ titleDoc +inlineToTEI opts (Note contents) = + inTagsIndented "note" $ blocksToTEI opts contents + +idAndRole :: Attr -> [(String, String)] +idAndRole (id',cls,_) = ident ++ role + where + ident = if null id' + then [] + else [("id", id')] + role = if null cls + then [] + else [("role", unwords cls)] diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs new file mode 100644 index 000000000..fe6024351 --- /dev/null +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -0,0 +1,498 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2008-2015 John MacFarlane and Peter Wang + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Texinfo + Copyright : Copyright (C) 2008-2015 John MacFarlane and Peter Wang + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' format into Texinfo. +-} +module Text.Pandoc.Writers.Texinfo ( writeTexinfo ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Templates (renderTemplate') +import Text.Printf ( printf ) +import Data.List ( transpose, maximumBy ) +import Data.Ord ( comparing ) +import Data.Char ( chr, ord ) +import Control.Monad.State +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import Network.URI ( isURI, unEscapeString ) +import System.FilePath +import qualified Data.Set as Set +import Control.Monad.Except (throwError) +import Text.Pandoc.Error +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging + +data WriterState = + WriterState { stStrikeout :: Bool -- document contains strikeout + , stSuperscript :: Bool -- document contains superscript + , stSubscript :: Bool -- document contains subscript + , stEscapeComma :: Bool -- in a context where we need @comma + , stIdentifiers :: Set.Set String -- header ids used already + , stOptions :: WriterOptions -- writer options + } + +{- TODO: + - internal cross references a la HTML + - generated .texi files don't work when run through texi2dvi + -} + +type TI m = StateT WriterState m + +-- | Convert Pandoc to Texinfo. +writeTexinfo :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeTexinfo options document = + evalStateT (pandocToTexinfo options $ wrapTop document) $ + WriterState { stStrikeout = False, stSuperscript = False, + stEscapeComma = False, stSubscript = False, + stIdentifiers = Set.empty, stOptions = options} + +-- | Add a "Top" node around the document, needed by Texinfo. +wrapTop :: Pandoc -> Pandoc +wrapTop (Pandoc meta blocks) = + Pandoc meta (Header 0 nullAttr (docTitle meta) : blocks) + +pandocToTexinfo :: PandocMonad m => WriterOptions -> Pandoc -> TI m String +pandocToTexinfo options (Pandoc meta blocks) = do + let titlePage = not $ all null + $ docTitle meta : docDate meta : docAuthors meta + let colwidth = if writerWrapText options == WrapAuto + then Just $ writerColumns options + else Nothing + metadata <- metaToJSON options + (fmap (render colwidth) . blockListToTexinfo) + (fmap (render colwidth) . inlineListToTexinfo) + meta + main <- blockListToTexinfo blocks + st <- get + let body = render colwidth main + let context = defField "body" body + $ defField "toc" (writerTableOfContents options) + $ defField "titlepage" titlePage + $ defField "subscript" (stSubscript st) + $ defField "superscript" (stSuperscript st) + $ defField "strikeout" (stStrikeout st) + $ metadata + case writerTemplate options of + Nothing -> return body + Just tpl -> return $ renderTemplate' tpl context + +-- | Escape things as needed for Texinfo. +stringToTexinfo :: String -> String +stringToTexinfo = escapeStringUsing texinfoEscapes + where texinfoEscapes = [ ('{', "@{") + , ('}', "@}") + , ('@', "@@") + , ('\160', "@ ") + , ('\x2014', "---") + , ('\x2013', "--") + , ('\x2026', "@dots{}") + , ('\x2019', "'") + ] + +escapeCommas :: PandocMonad m => TI m Doc -> TI m Doc +escapeCommas parser = do + oldEscapeComma <- gets stEscapeComma + modify $ \st -> st{ stEscapeComma = True } + res <- parser + modify $ \st -> st{ stEscapeComma = oldEscapeComma } + return res + +-- | Puts contents into Texinfo command. +inCmd :: String -> Doc -> Doc +inCmd cmd contents = char '@' <> text cmd <> braces contents + +-- | Convert Pandoc block element to Texinfo. +blockToTexinfo :: PandocMonad m + => Block -- ^ Block to convert + -> TI m Doc + +blockToTexinfo Null = return empty + +blockToTexinfo (Div _ bs) = blockListToTexinfo bs + +blockToTexinfo (Plain lst) = + inlineListToTexinfo lst + +-- title beginning with fig: indicates that the image is a figure +blockToTexinfo (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return empty + else (\c -> text "@caption" <> braces c) `fmap` + inlineListToTexinfo txt + img <- inlineToTexinfo (Image attr txt (src,tit)) + return $ text "@float" $$ img $$ capt $$ text "@end float" + +blockToTexinfo (Para lst) = + inlineListToTexinfo lst -- this is handled differently from Plain in blockListToTexinfo + +blockToTexinfo (LineBlock lns) = + blockToTexinfo $ linesToPara lns + +blockToTexinfo (BlockQuote lst) = do + contents <- blockListToTexinfo lst + return $ text "@quotation" $$ + contents $$ + text "@end quotation" + +blockToTexinfo (CodeBlock _ str) = do + return $ blankline $$ + text "@verbatim" $$ + flush (text str) $$ + text "@end verbatim" <> blankline + +blockToTexinfo b@(RawBlock f str) + | f == "texinfo" = return $ text str + | f == "latex" || f == "tex" = + return $ text "@tex" $$ text str $$ text "@end tex" + | otherwise = do + report $ BlockNotRendered b + return empty + +blockToTexinfo (BulletList lst) = do + items <- mapM listItemToTexinfo lst + return $ text "@itemize" $$ + vcat items $$ + text "@end itemize" <> blankline + +blockToTexinfo (OrderedList (start, numstyle, _) lst) = do + items <- mapM listItemToTexinfo lst + return $ text "@enumerate " <> exemplar $$ + vcat items $$ + text "@end enumerate" <> blankline + where + exemplar = case numstyle of + DefaultStyle -> decimal + Decimal -> decimal + Example -> decimal + UpperRoman -> decimal -- Roman numerals not supported + LowerRoman -> decimal + UpperAlpha -> upperAlpha + LowerAlpha -> lowerAlpha + decimal = if start == 1 + then empty + else text (show start) + upperAlpha = text [chr $ ord 'A' + start - 1] + lowerAlpha = text [chr $ ord 'a' + start - 1] + +blockToTexinfo (DefinitionList lst) = do + items <- mapM defListItemToTexinfo lst + return $ text "@table @asis" $$ + vcat items $$ + text "@end table" <> blankline + +blockToTexinfo HorizontalRule = + -- XXX can't get the equivalent from LaTeX.hs to work + return $ text "@iftex" $$ + text "@bigskip@hrule@bigskip" $$ + text "@end iftex" $$ + text "@ifnottex" $$ + text (take 72 $ repeat '-') $$ + text "@end ifnottex" + +blockToTexinfo (Header 0 _ lst) = do + txt <- if null lst + then return $ text "Top" + else inlineListToTexinfo lst + return $ text "@node Top" $$ + text "@top " <> txt <> blankline + +blockToTexinfo (Header level _ lst) + | level < 1 || level > 4 = blockToTexinfo (Para lst) + | otherwise = do + node <- inlineListForNode lst + txt <- inlineListToTexinfo lst + idsUsed <- gets stIdentifiers + let id' = uniqueIdent lst idsUsed + modify $ \st -> st{ stIdentifiers = Set.insert id' idsUsed } + sec <- seccmd level + return $ if (level > 0) && (level <= 4) + then blankline <> text "@node " <> node $$ + text sec <> txt $$ + text "@anchor" <> braces (text $ '#':id') + else txt + where + seccmd :: PandocMonad m => Int -> TI m String + seccmd 1 = return "@chapter " + seccmd 2 = return "@section " + seccmd 3 = return "@subsection " + seccmd 4 = return "@subsubsection " + seccmd _ = throwError $ PandocSomeError "illegal seccmd level" + +blockToTexinfo (Table caption aligns widths heads rows) = do + headers <- if all null heads + then return empty + else tableHeadToTexinfo aligns heads + captionText <- inlineListToTexinfo caption + rowsText <- mapM (tableRowToTexinfo aligns) rows + colDescriptors <- + if all (== 0) widths + then do -- use longest entry instead of column widths + cols <- mapM (mapM (liftM (render Nothing . hcat) . mapM blockToTexinfo)) $ + transpose $ heads : rows + return $ concatMap ((\x -> "{"++x++"} ") . maximumBy (comparing length)) cols + else return $ "@columnfractions " ++ concatMap (printf "%.2f ") widths + let tableBody = text ("@multitable " ++ colDescriptors) $$ + headers $$ + vcat rowsText $$ + text "@end multitable" + return $ if isEmpty captionText + then tableBody <> blankline + else text "@float" $$ + tableBody $$ + inCmd "caption" captionText $$ + text "@end float" + +tableHeadToTexinfo :: PandocMonad m + => [Alignment] + -> [[Block]] + -> TI m Doc +tableHeadToTexinfo = tableAnyRowToTexinfo "@headitem " + +tableRowToTexinfo :: PandocMonad m + => [Alignment] + -> [[Block]] + -> TI m Doc +tableRowToTexinfo = tableAnyRowToTexinfo "@item " + +tableAnyRowToTexinfo :: PandocMonad m + => String + -> [Alignment] + -> [[Block]] + -> TI m Doc +tableAnyRowToTexinfo itemtype aligns cols = + zipWithM alignedBlock aligns cols >>= + return . (text itemtype $$) . foldl (\row item -> row $$ + (if isEmpty row then empty else text " @tab ") <> item) empty + +alignedBlock :: PandocMonad m + => Alignment + -> [Block] + -> TI m Doc +-- XXX @flushleft and @flushright text won't get word wrapped. Since word +-- wrapping is more important than alignment, we ignore the alignment. +alignedBlock _ = blockListToTexinfo +{- +alignedBlock AlignLeft col = do + b <- blockListToTexinfo col + return $ text "@flushleft" $$ b $$ text "@end flushleft" +alignedBlock AlignRight col = do + b <- blockListToTexinfo col + return $ text "@flushright" $$ b $$ text "@end flushright" +alignedBlock _ col = blockListToTexinfo col +-} + +-- | Convert Pandoc block elements to Texinfo. +blockListToTexinfo :: PandocMonad m + => [Block] + -> TI m Doc +blockListToTexinfo [] = return empty +blockListToTexinfo (x:xs) = do + x' <- blockToTexinfo x + case x of + Header level _ _ -> do + -- We need need to insert a menu for this node. + let (before, after) = break isHeaderBlock xs + before' <- blockListToTexinfo before + let menu = if level < 4 + then collectNodes (level + 1) after + else [] + lines' <- mapM makeMenuLine menu + let menu' = if null lines' + then empty + else text "@menu" $$ + vcat lines' $$ + text "@end menu" + after' <- blockListToTexinfo after + return $ x' $$ before' $$ menu' $$ after' + Para _ -> do + xs' <- blockListToTexinfo xs + case xs of + ((CodeBlock _ _):_) -> return $ x' $$ xs' + _ -> return $ x' $+$ xs' + _ -> do + xs' <- blockListToTexinfo xs + return $ x' $$ xs' + +collectNodes :: Int -> [Block] -> [Block] +collectNodes _ [] = [] +collectNodes level (x:xs) = + case x of + (Header hl _ _) -> + if hl < level + then [] + else if hl == level + then x : collectNodes level xs + else collectNodes level xs + _ -> + collectNodes level xs + +makeMenuLine :: PandocMonad m + => Block + -> TI m Doc +makeMenuLine (Header _ _ lst) = do + txt <- inlineListForNode lst + return $ text "* " <> txt <> text "::" +makeMenuLine _ = throwError $ PandocSomeError "makeMenuLine called with non-Header block" + +listItemToTexinfo :: PandocMonad m + => [Block] + -> TI m Doc +listItemToTexinfo lst = do + contents <- blockListToTexinfo lst + let spacer = case reverse lst of + (Para{}:_) -> blankline + _ -> empty + return $ text "@item" $$ contents <> spacer + +defListItemToTexinfo :: PandocMonad m + => ([Inline], [[Block]]) + -> TI m Doc +defListItemToTexinfo (term, defs) = do + term' <- inlineListToTexinfo term + let defToTexinfo bs = do d <- blockListToTexinfo bs + case reverse bs of + (Para{}:_) -> return $ d <> blankline + _ -> return d + defs' <- mapM defToTexinfo defs + return $ text "@item " <> term' $+$ vcat defs' + +-- | Convert list of inline elements to Texinfo. +inlineListToTexinfo :: PandocMonad m + => [Inline] -- ^ Inlines to convert + -> TI m Doc +inlineListToTexinfo lst = mapM inlineToTexinfo lst >>= return . hcat + +-- | Convert list of inline elements to Texinfo acceptable for a node name. +inlineListForNode :: PandocMonad m + => [Inline] -- ^ Inlines to convert + -> TI m Doc +inlineListForNode = return . text . stringToTexinfo . + filter (not . disallowedInNode) . stringify + +-- periods, commas, colons, and parentheses are disallowed in node names +disallowedInNode :: Char -> Bool +disallowedInNode c = c `elem` (".,:()" :: String) + +-- | Convert inline element to Texinfo +inlineToTexinfo :: PandocMonad m + => Inline -- ^ Inline to convert + -> TI m Doc + +inlineToTexinfo (Span _ lst) = + inlineListToTexinfo lst + +inlineToTexinfo (Emph lst) = + inlineListToTexinfo lst >>= return . inCmd "emph" + +inlineToTexinfo (Strong lst) = + inlineListToTexinfo lst >>= return . inCmd "strong" + +inlineToTexinfo (Strikeout lst) = do + modify $ \st -> st{ stStrikeout = True } + contents <- inlineListToTexinfo lst + return $ text "@textstrikeout{" <> contents <> text "}" + +inlineToTexinfo (Superscript lst) = do + modify $ \st -> st{ stSuperscript = True } + contents <- inlineListToTexinfo lst + return $ text "@textsuperscript{" <> contents <> char '}' + +inlineToTexinfo (Subscript lst) = do + modify $ \st -> st{ stSubscript = True } + contents <- inlineListToTexinfo lst + return $ text "@textsubscript{" <> contents <> char '}' + +inlineToTexinfo (SmallCaps lst) = + inlineListToTexinfo lst >>= return . inCmd "sc" + +inlineToTexinfo (Code _ str) = do + return $ text $ "@code{" ++ stringToTexinfo str ++ "}" + +inlineToTexinfo (Quoted SingleQuote lst) = do + contents <- inlineListToTexinfo lst + return $ char '`' <> contents <> char '\'' + +inlineToTexinfo (Quoted DoubleQuote lst) = do + contents <- inlineListToTexinfo lst + return $ text "``" <> contents <> text "''" + +inlineToTexinfo (Cite _ lst) = + inlineListToTexinfo lst +inlineToTexinfo (Str str) = return $ text (stringToTexinfo str) +inlineToTexinfo (Math _ str) = return $ inCmd "math" $ text str +inlineToTexinfo il@(RawInline f str) + | f == "latex" || f == "tex" = + return $ text "@tex" $$ text str $$ text "@end tex" + | f == "texinfo" = return $ text str + | otherwise = do + report $ InlineNotRendered il + return empty +inlineToTexinfo (LineBreak) = return $ text "@*" <> cr +inlineToTexinfo SoftBreak = do + wrapText <- gets (writerWrapText . stOptions) + case wrapText of + WrapAuto -> return space + WrapNone -> return space + WrapPreserve -> return cr +inlineToTexinfo Space = return space + +inlineToTexinfo (Link _ txt (src@('#':_), _)) = do + contents <- escapeCommas $ inlineListToTexinfo txt + return $ text "@ref" <> + braces (text (stringToTexinfo src) <> text "," <> contents) +inlineToTexinfo (Link _ txt (src, _)) = do + case txt of + [Str x] | escapeURI x == src -> -- autolink + do return $ text $ "@url{" ++ x ++ "}" + _ -> do contents <- escapeCommas $ inlineListToTexinfo txt + let src1 = stringToTexinfo src + return $ text ("@uref{" ++ src1 ++ ",") <> contents <> + char '}' + +inlineToTexinfo (Image attr alternate (source, _)) = do + content <- escapeCommas $ inlineListToTexinfo alternate + opts <- gets stOptions + let showDim dim = case (dimension dim attr) of + (Just (Pixel a)) -> showInInch opts (Pixel a) ++ "in" + (Just (Percent _)) -> "" + (Just d) -> show d + Nothing -> "" + return $ text ("@image{" ++ base ++ ',':(showDim Width) ++ ',':(showDim Height) ++ ",") + <> content <> text "," <> text (ext ++ "}") + where + ext = drop 1 $ takeExtension source' + base = dropExtension source' + source' = if isURI source + then source + else unEscapeString source + +inlineToTexinfo (Note contents) = do + contents' <- blockListToTexinfo contents + return $ text "@footnote" <> braces contents' diff --git a/src/Text/Pandoc/Writers/Textile.hs b/src/Text/Pandoc/Writers/Textile.hs new file mode 100644 index 000000000..45f1780cf --- /dev/null +++ b/src/Text/Pandoc/Writers/Textile.hs @@ -0,0 +1,486 @@ +{- +Copyright (C) 2010-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Textile + Copyright : Copyright (C) 2010-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Textile markup. + +Textile: <http://thresholdstate.com/articles/4312/the-textile-reference-manual> +-} +module Text.Pandoc.Writers.Textile ( writeTextile ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Shared +import Text.Pandoc.Pretty (render) +import Text.Pandoc.ImageSize +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Templates (renderTemplate') +import Text.Pandoc.XML ( escapeStringForXML ) +import Data.List ( intercalate ) +import Control.Monad.State +import Data.Char ( isSpace ) +import Text.Pandoc.Class ( PandocMonad ) + +data WriterState = WriterState { + stNotes :: [String] -- Footnotes + , stListLevel :: [Char] -- String at beginning of list items, e.g. "**" + , stStartNum :: Maybe Int -- Start number if first list item + , stUseTags :: Bool -- True if we should use HTML tags because we're in a complex list + } + +-- | Convert Pandoc to Textile. +writeTextile :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeTextile opts document = return $ + evalState (pandocToTextile opts document) + WriterState { stNotes = [], stListLevel = [], stStartNum = Nothing, + stUseTags = False } + +-- | Return Textile representation of document. +pandocToTextile :: WriterOptions -> Pandoc -> State WriterState String +pandocToTextile opts (Pandoc meta blocks) = do + metadata <- metaToJSON opts (blockListToTextile opts) + (inlineListToTextile opts) meta + body <- blockListToTextile opts blocks + notes <- liftM (unlines . reverse . stNotes) get + let main = body ++ if null notes then "" else ("\n\n" ++ notes) + let context = defField "body" main metadata + case writerTemplate opts of + Nothing -> return main + Just tpl -> return $ renderTemplate' tpl context + +withUseTags :: State WriterState a -> State WriterState a +withUseTags action = do + oldUseTags <- liftM stUseTags get + modify $ \s -> s { stUseTags = True } + result <- action + modify $ \s -> s { stUseTags = oldUseTags } + return result + +-- | Escape one character as needed for Textile. +escapeCharForTextile :: Char -> String +escapeCharForTextile x = case x of + '&' -> "&" + '<' -> "<" + '>' -> ">" + '"' -> """ + '*' -> "*" + '_' -> "_" + '@' -> "@" + '+' -> "+" + '-' -> "-" + '|' -> "|" + '\x2014' -> " -- " + '\x2013' -> " - " + '\x2019' -> "'" + '\x2026' -> "..." + c -> [c] + +-- | Escape string as needed for Textile. +escapeStringForTextile :: String -> String +escapeStringForTextile = concatMap escapeCharForTextile + +-- | Convert Pandoc block element to Textile. +blockToTextile :: WriterOptions -- ^ Options + -> Block -- ^ Block element + -> State WriterState String + +blockToTextile _ Null = return "" + +blockToTextile opts (Div attr bs) = do + let startTag = render Nothing $ tagWithAttrs "div" attr + let endTag = "</div>" + contents <- blockListToTextile opts bs + return $ startTag ++ "\n\n" ++ contents ++ "\n\n" ++ endTag ++ "\n" + +blockToTextile opts (Plain inlines) = + inlineListToTextile opts inlines + +-- title beginning with fig: indicates that the image is a figure +blockToTextile opts (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- blockToTextile opts (Para txt) + im <- inlineToTextile opts (Image attr txt (src,tit)) + return $ im ++ "\n" ++ capt + +blockToTextile opts (Para inlines) = do + useTags <- liftM stUseTags get + listLevel <- liftM stListLevel get + contents <- inlineListToTextile opts inlines + return $ if useTags + then "<p>" ++ contents ++ "</p>" + else contents ++ if null listLevel then "\n" else "" + +blockToTextile opts (LineBlock lns) = + blockToTextile opts $ linesToPara lns + +blockToTextile _ (RawBlock f str) + | f == Format "html" || f == Format "textile" = return str + | otherwise = return "" + +blockToTextile _ HorizontalRule = return "<hr />\n" + +blockToTextile opts (Header level (ident,classes,keyvals) inlines) = do + contents <- inlineListToTextile opts inlines + let identAttr = if null ident then "" else ('#':ident) + let attribs = if null identAttr && null classes + then "" + else "(" ++ unwords classes ++ identAttr ++ ")" + let lang = maybe "" (\x -> "[" ++ x ++ "]") $ lookup "lang" keyvals + let styles = maybe "" (\x -> "{" ++ x ++ "}") $ lookup "style" keyvals + let prefix = 'h' : show level ++ attribs ++ styles ++ lang ++ ". " + return $ prefix ++ contents ++ "\n" + +blockToTextile _ (CodeBlock (_,classes,_) str) | any (all isSpace) (lines str) = + return $ "<pre" ++ classes' ++ ">\n" ++ escapeStringForXML str ++ + "\n</pre>\n" + where classes' = if null classes + then "" + else " class=\"" ++ unwords classes ++ "\"" + +blockToTextile _ (CodeBlock (_,classes,_) str) = + return $ "bc" ++ classes' ++ ". " ++ str ++ "\n\n" + where classes' = if null classes + then "" + else "(" ++ unwords classes ++ ")" + +blockToTextile opts (BlockQuote bs@[Para _]) = do + contents <- blockListToTextile opts bs + return $ "bq. " ++ contents ++ "\n\n" + +blockToTextile opts (BlockQuote blocks) = do + contents <- blockListToTextile opts blocks + return $ "<blockquote>\n\n" ++ contents ++ "\n</blockquote>\n" + +blockToTextile opts (Table [] aligns widths headers rows') | + all (==0) widths = do + hs <- mapM (liftM (("_. " ++) . stripTrailingNewlines) . blockListToTextile opts) headers + let cellsToRow cells = "|" ++ intercalate "|" cells ++ "|" + let header = if all null headers then "" else cellsToRow hs ++ "\n" + let blocksToCell (align, bs) = do + contents <- stripTrailingNewlines <$> blockListToTextile opts bs + let alignMarker = case align of + AlignLeft -> "<. " + AlignRight -> ">. " + AlignCenter -> "=. " + AlignDefault -> "" + return $ alignMarker ++ contents + let rowToCells = mapM blocksToCell . zip aligns + bs <- mapM rowToCells rows' + let body = unlines $ map cellsToRow bs + return $ header ++ body + +blockToTextile opts (Table capt aligns widths headers rows') = do + let alignStrings = map alignmentToString aligns + captionDoc <- if null capt + then return "" + else do + c <- inlineListToTextile opts capt + return $ "<caption>" ++ c ++ "</caption>\n" + let percent w = show (truncate (100*w) :: Integer) ++ "%" + let coltags = if all (== 0.0) widths + then "" + else unlines $ map + (\w -> "<col width=\"" ++ percent w ++ "\" />") widths + head' <- if all null headers + then return "" + else do + hs <- tableRowToTextile opts alignStrings 0 headers + return $ "<thead>\n" ++ hs ++ "\n</thead>\n" + body' <- zipWithM (tableRowToTextile opts alignStrings) [1..] rows' + return $ "<table>\n" ++ captionDoc ++ coltags ++ head' ++ + "<tbody>\n" ++ unlines body' ++ "</tbody>\n</table>\n" + +blockToTextile opts x@(BulletList items) = do + oldUseTags <- liftM stUseTags get + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + contents <- withUseTags $ mapM (listItemToTextile opts) items + return $ "<ul>\n" ++ vcat contents ++ "\n</ul>\n" + else do + modify $ \s -> s { stListLevel = stListLevel s ++ "*" } + level <- get >>= return . length . stListLevel + contents <- mapM (listItemToTextile opts) items + modify $ \s -> s { stListLevel = init (stListLevel s) } + return $ vcat contents ++ (if level > 1 then "" else "\n") + +blockToTextile opts x@(OrderedList attribs@(start, _, _) items) = do + oldUseTags <- liftM stUseTags get + let useTags = oldUseTags || not (isSimpleList x) + if useTags + then do + contents <- withUseTags $ mapM (listItemToTextile opts) items + return $ "<ol" ++ listAttribsToString attribs ++ ">\n" ++ vcat contents ++ + "\n</ol>\n" + else do + modify $ \s -> s { stListLevel = stListLevel s ++ "#" + , stStartNum = if start > 1 + then Just start + else Nothing } + level <- get >>= return . length . stListLevel + contents <- mapM (listItemToTextile opts) items + modify $ \s -> s { stListLevel = init (stListLevel s), + stStartNum = Nothing } + return $ vcat contents ++ (if level > 1 then "" else "\n") + +blockToTextile opts (DefinitionList items) = do + contents <- withUseTags $ mapM (definitionListItemToTextile opts) items + return $ "<dl>\n" ++ vcat contents ++ "\n</dl>\n" + +-- Auxiliary functions for lists: + +-- | Convert ordered list attributes to HTML attribute string +listAttribsToString :: ListAttributes -> String +listAttribsToString (startnum, numstyle, _) = + let numstyle' = camelCaseToHyphenated $ show numstyle + in (if startnum /= 1 + then " start=\"" ++ show startnum ++ "\"" + else "") ++ + (if numstyle /= DefaultStyle + then " style=\"list-style-type: " ++ numstyle' ++ ";\"" + else "") + +-- | Convert bullet or ordered list item (list of blocks) to Textile. +listItemToTextile :: WriterOptions -> [Block] -> State WriterState String +listItemToTextile opts items = do + contents <- blockListToTextile opts items + useTags <- get >>= return . stUseTags + if useTags + then return $ "<li>" ++ contents ++ "</li>" + else do + marker <- gets stListLevel + mbstart <- gets stStartNum + case mbstart of + Just n -> do + modify $ \s -> s{ stStartNum = Nothing } + return $ marker ++ show n ++ " " ++ contents + Nothing -> return $ marker ++ " " ++ contents + +-- | Convert definition list item (label, list of blocks) to Textile. +definitionListItemToTextile :: WriterOptions + -> ([Inline],[[Block]]) + -> State WriterState String +definitionListItemToTextile opts (label, items) = do + labelText <- inlineListToTextile opts label + contents <- mapM (blockListToTextile opts) items + return $ "<dt>" ++ labelText ++ "</dt>\n" ++ + (intercalate "\n" $ map (\d -> "<dd>" ++ d ++ "</dd>") contents) + +-- | True if the list can be handled by simple wiki markup, False if HTML tags will be needed. +isSimpleList :: Block -> Bool +isSimpleList x = + case x of + BulletList items -> all isSimpleListItem items + OrderedList (_, sty, _) items -> all isSimpleListItem items && + sty `elem` [DefaultStyle, Decimal] + _ -> False + +-- | True if list item can be handled with the simple wiki syntax. False if +-- HTML tags will be needed. +isSimpleListItem :: [Block] -> Bool +isSimpleListItem [] = True +isSimpleListItem [x] = + case x of + Plain _ -> True + Para _ -> True + BulletList _ -> isSimpleList x + OrderedList _ _ -> isSimpleList x + _ -> False +isSimpleListItem [x, y] | isPlainOrPara x = + case y of + BulletList _ -> isSimpleList y + OrderedList _ _ -> isSimpleList y + _ -> False +isSimpleListItem _ = False + +isPlainOrPara :: Block -> Bool +isPlainOrPara (Plain _) = True +isPlainOrPara (Para _) = True +isPlainOrPara _ = False + +-- | Concatenates strings with line breaks between them. +vcat :: [String] -> String +vcat = intercalate "\n" + +-- Auxiliary functions for tables. (TODO: these are common to HTML, MediaWiki, +-- and Textile writers, and should be abstracted out.) + +tableRowToTextile :: WriterOptions + -> [String] + -> Int + -> [[Block]] + -> State WriterState String +tableRowToTextile opts alignStrings rownum cols' = do + let celltype = if rownum == 0 then "th" else "td" + let rowclass = case rownum of + 0 -> "header" + x | x `rem` 2 == 1 -> "odd" + _ -> "even" + cols'' <- sequence $ zipWith + (\alignment item -> tableItemToTextile opts celltype alignment item) + alignStrings cols' + return $ "<tr class=\"" ++ rowclass ++ "\">\n" ++ unlines cols'' ++ "</tr>" + +alignmentToString :: Alignment -> [Char] +alignmentToString alignment = case alignment of + AlignLeft -> "left" + AlignRight -> "right" + AlignCenter -> "center" + AlignDefault -> "left" + +tableItemToTextile :: WriterOptions + -> String + -> String + -> [Block] + -> State WriterState String +tableItemToTextile opts celltype align' item = do + let mkcell x = "<" ++ celltype ++ " align=\"" ++ align' ++ "\">" ++ + x ++ "</" ++ celltype ++ ">" + contents <- blockListToTextile opts item + return $ mkcell contents + +-- | Convert list of Pandoc block elements to Textile. +blockListToTextile :: WriterOptions -- ^ Options + -> [Block] -- ^ List of block elements + -> State WriterState String +blockListToTextile opts blocks = + mapM (blockToTextile opts) blocks >>= return . vcat + +-- | Convert list of Pandoc inline elements to Textile. +inlineListToTextile :: WriterOptions -> [Inline] -> State WriterState String +inlineListToTextile opts lst = + mapM (inlineToTextile opts) lst >>= return . concat + +-- | Convert Pandoc inline element to Textile. +inlineToTextile :: WriterOptions -> Inline -> State WriterState String + +inlineToTextile opts (Span _ lst) = + inlineListToTextile opts lst + +inlineToTextile opts (Emph lst) = do + contents <- inlineListToTextile opts lst + return $ if '_' `elem` contents + then "<em>" ++ contents ++ "</em>" + else "_" ++ contents ++ "_" + +inlineToTextile opts (Strong lst) = do + contents <- inlineListToTextile opts lst + return $ if '*' `elem` contents + then "<strong>" ++ contents ++ "</strong>" + else "*" ++ contents ++ "*" + +inlineToTextile opts (Strikeout lst) = do + contents <- inlineListToTextile opts lst + return $ if '-' `elem` contents + then "<del>" ++ contents ++ "</del>" + else "-" ++ contents ++ "-" + +inlineToTextile opts (Superscript lst) = do + contents <- inlineListToTextile opts lst + return $ if '^' `elem` contents + then "<sup>" ++ contents ++ "</sup>" + else "[^" ++ contents ++ "^]" + +inlineToTextile opts (Subscript lst) = do + contents <- inlineListToTextile opts lst + return $ if '~' `elem` contents + then "<sub>" ++ contents ++ "</sub>" + else "[~" ++ contents ++ "~]" + +inlineToTextile opts (SmallCaps lst) = inlineListToTextile opts lst + +inlineToTextile opts (Quoted SingleQuote lst) = do + contents <- inlineListToTextile opts lst + return $ "'" ++ contents ++ "'" + +inlineToTextile opts (Quoted DoubleQuote lst) = do + contents <- inlineListToTextile opts lst + return $ "\"" ++ contents ++ "\"" + +inlineToTextile opts (Cite _ lst) = inlineListToTextile opts lst + +inlineToTextile _ (Code _ str) = + return $ if '@' `elem` str + then "<tt>" ++ escapeStringForXML str ++ "</tt>" + else "@" ++ str ++ "@" + +inlineToTextile _ (Str str) = return $ escapeStringForTextile str + +inlineToTextile _ (Math _ str) = + return $ "<span class=\"math\">" ++ escapeStringForXML str ++ "</math>" + +inlineToTextile opts (RawInline f str) + | f == Format "html" || f == Format "textile" = return str + | (f == Format "latex" || f == Format "tex") && + isEnabled Ext_raw_tex opts = return str + | otherwise = return "" + +inlineToTextile _ LineBreak = return "\n" + +inlineToTextile _ SoftBreak = return " " + +inlineToTextile _ Space = return " " + +inlineToTextile opts (Link (_, cls, _) txt (src, _)) = do + let classes = if null cls + then "" + else "(" ++ unwords cls ++ ")" + label <- case txt of + [Code _ s] + | s == src -> return "$" + [Str s] + | s == src -> return "$" + _ -> inlineListToTextile opts txt + return $ "\"" ++ classes ++ label ++ "\":" ++ src + +inlineToTextile opts (Image attr@(_, cls, _) alt (source, tit)) = do + alt' <- inlineListToTextile opts alt + let txt = if null tit + then if null alt' + then "" + else "(" ++ alt' ++ ")" + else "(" ++ tit ++ ")" + classes = if null cls + then "" + else "(" ++ unwords cls ++ ")" + showDim dir = let toCss str = Just $ show dir ++ ":" ++ str ++ ";" + in case (dimension dir attr) of + Just (Percent a) -> toCss $ show (Percent a) + Just dim -> toCss $ showInPixel opts dim ++ "px" + Nothing -> Nothing + styles = case (showDim Width, showDim Height) of + (Just w, Just h) -> "{" ++ w ++ h ++ "}" + (Just w, Nothing) -> "{" ++ w ++ "height:auto;}" + (Nothing, Just h) -> "{" ++ "width:auto;" ++ h ++ "}" + (Nothing, Nothing) -> "" + return $ "!" ++ classes ++ styles ++ source ++ txt ++ "!" + +inlineToTextile opts (Note contents) = do + curNotes <- liftM stNotes get + let newnum = length curNotes + 1 + contents' <- blockListToTextile opts contents + let thisnote = "fn" ++ show newnum ++ ". " ++ contents' ++ "\n" + modify $ \s -> s { stNotes = thisnote : curNotes } + return $ "[" ++ show newnum ++ "]" + -- note - may not work for notes with multiple blocks diff --git a/src/Text/Pandoc/Writers/ZimWiki.hs b/src/Text/Pandoc/Writers/ZimWiki.hs new file mode 100644 index 000000000..d01ce0e8b --- /dev/null +++ b/src/Text/Pandoc/Writers/ZimWiki.hs @@ -0,0 +1,396 @@ +{- +Copyright (C) 2008-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.ZimWiki + Copyright : Copyright (C) 2008-2015 John MacFarlane, 2017 Alex Ivkin + License : GNU GPL, version 2 or above + + Maintainer : Alex Ivkin <alex@ivkin.net> + Stability : beta + Portability : portable + +Conversion of 'Pandoc' documents to ZimWiki markup. + +http://zim-wiki.org/manual/Help/Wiki_Syntax.html +-} + +module Text.Pandoc.Writers.ZimWiki ( writeZimWiki ) where +import Text.Pandoc.Definition +import Text.Pandoc.Options ( WriterOptions(writerTableOfContents, writerTemplate, writerWrapText), WrapOption(..) ) +import Text.Pandoc.Shared ( escapeURI, linesToPara, removeFormatting, trimr + , substitute ) +import Text.Pandoc.Writers.Shared ( defField, metaToJSON ) +import Text.Pandoc.ImageSize +import Text.Pandoc.Templates ( renderTemplate' ) +import Data.List ( intercalate, isPrefixOf, transpose, isInfixOf ) +import Data.Text ( breakOnAll, pack ) +import Data.Default (Default(..)) +import Network.URI ( isURI ) +import Control.Monad ( zipWithM ) +import Control.Monad.State ( modify, State, get, evalState ) +import Text.Pandoc.Class ( PandocMonad ) +import qualified Data.Map as Map + +data WriterState = WriterState { + stItemNum :: Int, + stIndent :: String, -- Indent after the marker at the beginning of list items + stInTable :: Bool, -- Inside a table + stInLink :: Bool -- Inside a link description + } + +instance Default WriterState where + def = WriterState { stItemNum = 1, stIndent = "", stInTable = False, stInLink = False } + +-- | Convert Pandoc to ZimWiki. +writeZimWiki :: PandocMonad m => WriterOptions -> Pandoc -> m String +writeZimWiki opts document = return $ evalState (pandocToZimWiki opts document) def + +-- | Return ZimWiki representation of document. +pandocToZimWiki :: WriterOptions -> Pandoc -> State WriterState String +pandocToZimWiki opts (Pandoc meta blocks) = do + metadata <- metaToJSON opts + (fmap trimr . blockListToZimWiki opts) + (inlineListToZimWiki opts) + meta + body <- blockListToZimWiki opts blocks + --let header = "Content-Type: text/x-zim-wiki\nWiki-Format: zim 0.4\n" + let main = body + let context = defField "body" main + $ defField "toc" (writerTableOfContents opts) + $ metadata + case writerTemplate opts of + Just tpl -> return $ renderTemplate' tpl context + Nothing -> return main + +-- | Escape special characters for ZimWiki. +escapeString :: String -> String +escapeString = substitute "__" "''__''" . + substitute "**" "''**''" . + substitute "~~" "''~~''" . + substitute "//" "''//''" + +-- | Convert Pandoc block element to ZimWiki. +blockToZimWiki :: WriterOptions -> Block -> State WriterState String + +blockToZimWiki _ Null = return "" + +blockToZimWiki opts (Div _attrs bs) = do + contents <- blockListToZimWiki opts bs + return $ contents ++ "\n" + +blockToZimWiki opts (Plain inlines) = inlineListToZimWiki opts inlines + +-- title beginning with fig: indicates that the image is a figure +-- ZimWiki doesn't support captions - so combine together alt and caption into alt +blockToZimWiki opts (Para [Image attr txt (src,'f':'i':'g':':':tit)]) = do + capt <- if null txt + then return "" + else (" " ++) `fmap` inlineListToZimWiki opts txt + let opt = if null txt + then "" + else "|" ++ if null tit then capt else tit ++ capt + -- Relative links fail isURI and receive a colon + prefix = if isURI src then "" else ":" + return $ "{{" ++ prefix ++ src ++ imageDims opts attr ++ opt ++ "}}\n" + +blockToZimWiki opts (Para inlines) = do + indent <- stIndent <$> get + -- useTags <- stUseTags <$> get + contents <- inlineListToZimWiki opts inlines + return $ contents ++ if null indent then "\n" else "" + +blockToZimWiki opts (LineBlock lns) = do + blockToZimWiki opts $ linesToPara lns + +blockToZimWiki opts (RawBlock f str) + | f == Format "zimwiki" = return str + | f == Format "html" = do cont <- indentFromHTML opts str; return cont + | otherwise = return "" + +blockToZimWiki _ HorizontalRule = return "\n----\n" + +blockToZimWiki opts (Header level _ inlines) = do + contents <- inlineListToZimWiki opts $ removeFormatting inlines -- emphasis, links etc. not allowed in headers + let eqs = replicate ( 7 - level ) '=' + return $ eqs ++ " " ++ contents ++ " " ++ eqs ++ "\n" + +blockToZimWiki _ (CodeBlock (_,classes,_) str) = do + -- Remap languages into the gtksourceview2 convention that ZimWiki source code plugin is using + let langal = [("javascript", "js"), ("bash", "sh"), ("winbatch", "dosbatch")] + let langmap = Map.fromList langal + return $ case classes of + [] -> "'''\n" ++ cleanupCode str ++ "\n'''\n" -- turn no lang block into a quote block + (x:_) -> "{{{code: lang=\"" ++ + (case Map.lookup x langmap of + Nothing -> x + Just y -> y) ++ "\" linenumbers=\"True\"\n" ++ str ++ "\n}}}\n" -- for zim's code plugin, go verbatim on the lang spec + +blockToZimWiki opts (BlockQuote blocks) = do + contents <- blockListToZimWiki opts blocks + return $ unlines $ map ("> " ++) $ lines contents + +blockToZimWiki opts (Table capt aligns _ headers rows) = do + captionDoc <- if null capt + then return "" + else do + c <- inlineListToZimWiki opts capt + return $ "" ++ c ++ "\n" + headers' <- if all null headers + then zipWithM (tableItemToZimWiki opts) aligns (rows !! 0) + else mapM (inlineListToZimWiki opts) (map removeFormatting headers) -- emphasis, links etc. are not allowed in table headers + rows' <- mapM (zipWithM (tableItemToZimWiki opts) aligns) rows + let widths = map (maximum . map length) $ transpose (headers':rows') + let padTo (width, al) s = + case (width - length s) of + x | x > 0 -> + if al == AlignLeft || al == AlignDefault + then s ++ replicate x ' ' + else if al == AlignRight + then replicate x ' ' ++ s + else replicate (x `div` 2) ' ' ++ + s ++ replicate (x - x `div` 2) ' ' + | otherwise -> s + let borderCell (width, al) _ = + if al == AlignLeft + then ":"++ replicate (width-1) '-' + else if al == AlignDefault + then replicate width '-' + else if al == AlignRight + then replicate (width-1) '-' ++ ":" + else ":" ++ replicate (width-2) '-' ++ ":" + let underheader = "|" ++ intercalate "|" (zipWith borderCell (zip widths aligns) headers') ++ "|" + let renderRow cells = "|" ++ intercalate "|" (zipWith padTo (zip widths aligns) cells) ++ "|" + return $ captionDoc ++ + (if null headers' then "" else renderRow headers' ++ "\n") ++ underheader ++ "\n" ++ + unlines (map renderRow rows') + +blockToZimWiki opts (BulletList items) = do + indent <- stIndent <$> get + modify $ \s -> s { stIndent = stIndent s ++ "\t" } + contents <- (mapM (listItemToZimWiki opts) items) + modify $ \s -> s{ stIndent = indent } -- drop 1 (stIndent s) } + return $ vcat contents ++ if null indent then "\n" else "" + +blockToZimWiki opts (OrderedList _ items) = do + indent <- stIndent <$> get + modify $ \s -> s { stIndent = stIndent s ++ "\t", stItemNum = 1 } + contents <- (mapM (orderedListItemToZimWiki opts) items) + modify $ \s -> s{ stIndent = indent } -- drop 1 (stIndent s) } + return $ vcat contents ++ if null indent then "\n" else "" + +blockToZimWiki opts (DefinitionList items) = do + contents <- (mapM (definitionListItemToZimWiki opts) items) + return $ vcat contents + +definitionListItemToZimWiki :: WriterOptions -> ([Inline],[[Block]]) -> State WriterState String +definitionListItemToZimWiki opts (label, items) = do + labelText <- inlineListToZimWiki opts label + contents <- mapM (blockListToZimWiki opts) items + indent <- stIndent <$> get + return $ indent ++ "* **" ++ labelText ++ "** " ++ concat contents + +-- Auxiliary functions for lists: +indentFromHTML :: WriterOptions -> String -> State WriterState String +indentFromHTML _ str = do + indent <- stIndent <$> get + itemnum <- stItemNum <$> get + if isInfixOf "<li>" str then return $ indent ++ show itemnum ++ "." + else if isInfixOf "</li>" str then return "\n" + else if isInfixOf "<li value=" str then do + -- poor man's cut + let val = drop 10 $ reverse $ drop 1 $ reverse str + --let val = take ((length valls) - 2) valls + modify $ \s -> s { stItemNum = read val } + return "" + else if isInfixOf "<ol>" str then do + let olcount=countSubStrs "<ol>" str + modify $ \s -> s { stIndent = stIndent s ++ replicate olcount '\t', stItemNum = 1 } + return "" + else if isInfixOf "</ol>" str then do + let olcount=countSubStrs "/<ol>" str + modify $ \s -> s{ stIndent = drop olcount (stIndent s) } + return "" + else + return "" + +countSubStrs :: String -> String -> Int +countSubStrs sub str = length $ breakOnAll (pack sub) (pack str) + +cleanupCode :: String -> String +cleanupCode = substitute "<nowiki>" "" . substitute "</nowiki>" "" + +vcat :: [String] -> String +vcat = intercalate "\n" + +-- | Convert bullet list item (list of blocks) to ZimWiki. +listItemToZimWiki :: WriterOptions -> [Block] -> State WriterState String +listItemToZimWiki opts items = do + contents <- blockListToZimWiki opts items + indent <- stIndent <$> get + return $ indent ++ "* " ++ contents + +-- | Convert ordered list item (list of blocks) to ZimWiki. +orderedListItemToZimWiki :: WriterOptions -> [Block] -> State WriterState String +orderedListItemToZimWiki opts items = do + contents <- blockListToZimWiki opts items + indent <- stIndent <$> get + itemnum <- stItemNum <$> get + --modify $ \s -> s { stItemNum = itemnum + 1 } -- this is not strictly necessary for zim as zim does its own renumbering + return $ indent ++ show itemnum ++ ". " ++ contents + +-- Auxiliary functions for tables: +tableItemToZimWiki :: WriterOptions -> Alignment -> [Block] -> State WriterState String +tableItemToZimWiki opts align' item = do + let mkcell x = (if align' == AlignRight || align' == AlignCenter + then " " + else "") ++ x ++ + (if align' == AlignLeft || align' == AlignCenter + then " " + else "") + modify $ \s -> s { stInTable = True } + contents <- blockListToZimWiki opts item + modify $ \s -> s { stInTable = False } + return $ mkcell contents + +-- | Convert list of Pandoc block elements to ZimWiki. +blockListToZimWiki :: WriterOptions -> [Block] -> State WriterState String +blockListToZimWiki opts blocks = vcat <$> mapM (blockToZimWiki opts) blocks + +-- | Convert list of Pandoc inline elements to ZimWiki. +inlineListToZimWiki :: WriterOptions -> [Inline] -> State WriterState String +inlineListToZimWiki opts lst = concat <$> (mapM (inlineToZimWiki opts) lst) + +-- | Convert Pandoc inline element to ZimWiki. +inlineToZimWiki :: WriterOptions -> Inline -> State WriterState String + +inlineToZimWiki opts (Emph lst) = do + contents <- inlineListToZimWiki opts lst + return $ "//" ++ contents ++ "//" + +inlineToZimWiki opts (Strong lst) = do + contents <- inlineListToZimWiki opts lst + return $ "**" ++ contents ++ "**" + +inlineToZimWiki opts (Strikeout lst) = do + contents <- inlineListToZimWiki opts lst + return $ "~~" ++ contents ++ "~~" + +inlineToZimWiki opts (Superscript lst) = do + contents <- inlineListToZimWiki opts lst + return $ "^{" ++ contents ++ "}" + +inlineToZimWiki opts (Subscript lst) = do + contents <- inlineListToZimWiki opts lst + return $ "_{" ++ contents ++ "}" + +inlineToZimWiki opts (Quoted SingleQuote lst) = do + contents <- inlineListToZimWiki opts lst + return $ "\8216" ++ contents ++ "\8217" + +inlineToZimWiki opts (Quoted DoubleQuote lst) = do + contents <- inlineListToZimWiki opts lst + return $ "\8220" ++ contents ++ "\8221" + +inlineToZimWiki opts (Span _attrs ils) = inlineListToZimWiki opts ils + +inlineToZimWiki opts (SmallCaps lst) = inlineListToZimWiki opts lst + +inlineToZimWiki opts (Cite _ lst) = inlineListToZimWiki opts lst + +inlineToZimWiki _ (Code _ str) = return $ "''" ++ str ++ "''" + +inlineToZimWiki _ (Str str) = do + inTable <- stInTable <$> get + inLink <- stInLink <$> get + if inTable + then return $ substitute "|" "\\|" . escapeString $ str + else + if inLink + then return $ str + else return $ escapeString str + +inlineToZimWiki _ (Math mathType str) = return $ delim ++ str ++ delim -- note: str should NOT be escaped + where delim = case mathType of + DisplayMath -> "$$" + InlineMath -> "$" + +-- | f == Format "html" = return $ "<html>" ++ str ++ "</html>" +inlineToZimWiki opts (RawInline f str) + | f == Format "zimwiki" = return str + | f == Format "html" = do cont <- indentFromHTML opts str; return cont + | otherwise = return "" + +inlineToZimWiki _ LineBreak = do + inTable <- stInTable <$> get + if inTable + then return "\\n" + else return "\n" + +inlineToZimWiki opts SoftBreak = + case writerWrapText opts of + WrapNone -> return " " + WrapAuto -> return " " + WrapPreserve -> return "\n" + +inlineToZimWiki _ Space = return " " + +inlineToZimWiki opts (Link _ txt (src, _)) = do + inTable <- stInTable <$> get + modify $ \s -> s { stInLink = True } + label <- inlineListToZimWiki opts $ removeFormatting txt -- zim does not allow formatting in link text, it takes the text verbatim, no need to escape it + modify $ \s -> s { stInLink = False } + let label'= if inTable + then "" -- no label is allowed in a table + else "|"++label + case txt of + [Str s] | "mailto:" `isPrefixOf` src -> return $ "<" ++ s ++ ">" + | escapeURI s == src -> return src + _ -> if isURI src + then return $ "[[" ++ src ++ label' ++ "]]" + else return $ "[[" ++ src' ++ label' ++ "]]" + where src' = case src of + '/':xs -> xs -- with leading / it's a + _ -> src -- link to a help page +inlineToZimWiki opts (Image attr alt (source, tit)) = do + alt' <- inlineListToZimWiki opts alt + inTable <- stInTable <$> get + let txt = case (tit, alt, inTable) of + ("",[], _) -> "" + ("", _, False ) -> "|" ++ alt' + (_ , _, False ) -> "|" ++ tit + (_ , _, True ) -> "" + -- Relative links fail isURI and receive a colon + prefix = if isURI source then "" else ":" + return $ "{{" ++ prefix ++ source ++ imageDims opts attr ++ txt ++ "}}" + +inlineToZimWiki opts (Note contents) = do + -- no concept of notes in zim wiki, use a text block + contents' <- blockListToZimWiki opts contents + return $ " **{Note:** " ++ trimr contents' ++ "**}**" + +imageDims :: WriterOptions -> Attr -> String +imageDims opts attr = go (toPx $ dimension Width attr) (toPx $ dimension Height attr) + where + toPx = fmap (showInPixel opts) . checkPct + checkPct (Just (Percent _)) = Nothing + checkPct maybeDim = maybeDim + go (Just w) Nothing = "?" ++ w + go (Just w) (Just h) = "?" ++ w ++ "x" ++ h + go Nothing (Just h) = "?0x" ++ h + go Nothing Nothing = "" diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs new file mode 100644 index 000000000..e105aee91 --- /dev/null +++ b/src/Text/Pandoc/XML.hs @@ -0,0 +1,115 @@ +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.XML + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Functions for escaping and formatting XML. +-} +module Text.Pandoc.XML ( escapeCharForXML, + escapeStringForXML, + inTags, + selfClosingTag, + inTagsSimple, + inTagsIndented, + toEntities, + fromEntities ) where + +import Text.Pandoc.Pretty +import Data.Char (ord, isAscii, isSpace) +import Text.HTML.TagSoup.Entity (lookupEntity) + +-- | Escape one character as needed for XML. +escapeCharForXML :: Char -> String +escapeCharForXML x = case x of + '&' -> "&" + '<' -> "<" + '>' -> ">" + '"' -> """ + c -> [c] + +-- | Escape string as needed for XML. Entity references are not preserved. +escapeStringForXML :: String -> String +escapeStringForXML = concatMap escapeCharForXML + +-- | Escape newline characters as +escapeNls :: String -> String +escapeNls (x:xs) + | x == '\n' = " " ++ escapeNls xs + | otherwise = x : escapeNls xs +escapeNls [] = [] + +-- | Return a text object with a string of formatted XML attributes. +attributeList :: [(String, String)] -> Doc +attributeList = hcat . map + (\(a, b) -> text (' ' : escapeStringForXML a ++ "=\"" ++ + escapeNls (escapeStringForXML b) ++ "\"")) + +-- | Put the supplied contents between start and end tags of tagType, +-- with specified attributes and (if specified) indentation. +inTags:: Bool -> String -> [(String, String)] -> Doc -> Doc +inTags isIndented tagType attribs contents = + let openTag = char '<' <> text tagType <> attributeList attribs <> + char '>' + closeTag = text "</" <> text tagType <> char '>' + in if isIndented + then openTag $$ nest 2 contents $$ closeTag + else openTag <> contents <> closeTag + +-- | Return a self-closing tag of tagType with specified attributes +selfClosingTag :: String -> [(String, String)] -> Doc +selfClosingTag tagType attribs = + char '<' <> text tagType <> attributeList attribs <> text " />" + +-- | Put the supplied contents between start and end tags of tagType. +inTagsSimple :: String -> Doc -> Doc +inTagsSimple tagType = inTags False tagType [] + +-- | Put the supplied contents in indented block btw start and end tags. +inTagsIndented :: String -> Doc -> Doc +inTagsIndented tagType = inTags True tagType [] + +-- | Escape all non-ascii characters using numerical entities. +toEntities :: String -> String +toEntities [] = "" +toEntities (c:cs) + | isAscii c = c : toEntities cs + | otherwise = "&#" ++ show (ord c) ++ ";" ++ toEntities cs + +-- Unescapes XML entities +fromEntities :: String -> String +fromEntities ('&':xs) = + case lookupEntity ent' of + Just c -> c ++ fromEntities rest + Nothing -> '&' : fromEntities xs + where (ent, rest) = case break (\c -> isSpace c || c == ';') xs of + (zs,';':ys) -> (zs,ys) + (zs, ys) -> (zs,ys) + ent' = case ent of + '#':'X':ys -> '#':'x':ys -- workaround tagsoup bug + '#':_ -> ent + _ -> ent ++ ";" + +fromEntities (x:xs) = x : fromEntities xs +fromEntities [] = [] |