Diffstat (limited to 'src/Text')
198 files changed, 11653 insertions, 8041 deletions
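Among the larger changes in the hunks below is the removal of the in-tree Text.Pandoc.BCP47 parser in favour of Text.Collate.Lang from the text-collate package (see the Text.Pandoc.App and Text.Pandoc.Citeproc hunks, where Lang "en" "" "US" [] becomes Lang "en" Nothing (Just "US") [] [] [] and parseBCP47 becomes parseLang). What follows is a minimal standalone sketch, not part of the patch, of how the new six-field Lang is obtained; it assumes text-collate's exported parseLang and the field accessors implied by the record used in these hunks.

{-# LANGUAGE OverloadedStrings #-}
-- Sketch only: parse a BCP 47 tag with the text-collate API this patch
-- switches to.  The accessors (langLanguage, langRegion) are assumed to
-- match the six-field Lang record constructed in the hunks below.
import Text.Collate.Lang (Lang (..), parseLang)

main :: IO ()
main =
  case parseLang "en-US" of
    Left _     -> putStrLn "could not parse language tag"
    Right lang -> do
      print (langLanguage lang)   -- "en"
      print (langRegion lang)     -- Just "US"
      -- the full record corresponds to Lang "en" Nothing (Just "US") [] [] []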
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 549aeddfb..f09dfd8c7 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -3,7 +3,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index e6d5c93d4..98b072ffb 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -1,9 +1,10 @@ +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE CPP #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.App - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> @@ -19,6 +20,7 @@ module Text.Pandoc.App ( , Filter(..) , defaultOpts , parseOptions + , parseOptionsFromArgs , options , applyFilters ) where @@ -27,6 +29,7 @@ import Control.Monad ( (>=>), when ) import Control.Monad.Trans ( MonadIO(..) ) import Control.Monad.Except (throwError) import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Lazy as BL import Data.Char (toLower) import Data.Maybe (fromMaybe, isJust, isNothing) @@ -44,19 +47,21 @@ import System.FilePath ( takeBaseName, takeExtension ) import System.IO (nativeNewline, stdout) import qualified System.IO as IO (Newline (..)) import Text.Pandoc +import Text.Pandoc.Builder (setMeta) +import Text.Pandoc.MIME (getCharset) import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths) import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts, - IpynbOutput (..) ) -import Text.Pandoc.App.CommandLineOptions (parseOptions, options) + IpynbOutput (..)) +import Text.Pandoc.App.CommandLineOptions (parseOptions, parseOptionsFromArgs, + options) import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings) -import Text.Pandoc.BCP47 (Lang (..), parseBCP47) -import Text.Pandoc.Builder (setMeta) +import Text.Collate.Lang (Lang (..), parseLang) import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), applyFilters) import Text.Pandoc.PDF (makePDF) -import Text.Pandoc.SelfContained (makeDataURI, makeSelfContained) +import Text.Pandoc.SelfContained (makeSelfContained) import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs, headerShift, isURI, tabFilter, uriPathToPath, filterIpynbOutput, - defaultUserDataDirs, tshow, findM) + defaultUserDataDir, tshow) import Text.Pandoc.Writers.Shared (lookupMetaString) import Text.Pandoc.Readers.Markdown (yamlToMeta) import qualified Text.Pandoc.UTF8 as UTF8 @@ -67,25 +72,29 @@ import System.Posix.Terminal (queryTerminal) convertWithOpts :: Opt -> IO () convertWithOpts opts = do + datadir <- case optDataDir opts of + Nothing -> do + d <- defaultUserDataDir + exists <- doesDirectoryExist d + return $ if exists + then Just d + else Nothing + Just _ -> return $ optDataDir opts + let outputFile = fromMaybe "-" (optOutputFile opts) let filters = optFilters opts let verbosity = optVerbosity opts when (optDumpArgs opts) $ - do UTF8.hPutStrLn stdout outputFile - mapM_ (UTF8.hPutStrLn stdout) (fromMaybe ["-"] $ optInputFiles opts) + do UTF8.hPutStrLn stdout (T.pack outputFile) + mapM_ (UTF8.hPutStrLn stdout . 
T.pack) + (fromMaybe ["-"] $ optInputFiles opts) exitSuccess let sources = case optInputFiles opts of Just xs | not (optIgnoreArgs opts) -> xs _ -> ["-"] - datadir <- case optDataDir opts of - Nothing -> do - ds <- defaultUserDataDirs - findM doesDirectoryExist ds - Just _ -> return $ optDataDir opts - let runIO' :: PandocIO a -> IO a runIO' f = do (res, reports) <- runIOorExplode $ do @@ -151,9 +160,11 @@ convertWithOpts opts = do else optTabStop opts) - let readSources :: [FilePath] -> PandocIO Text - readSources srcs = convertTabs . T.intercalate (T.pack "\n") <$> - mapM readSource srcs + let readSources :: [FilePath] -> PandocIO [(FilePath, Text)] + readSources srcs = + mapM (\fp -> do + t <- readSource fp + return (if fp == "-" then "" else fp, convertTabs t)) srcs outputSettings <- optToOutputSettings opts @@ -190,20 +201,9 @@ convertWithOpts opts = do Nothing -> readDataFile "abbreviations" Just f -> readFileStrict f - metadata <- if format == "jats" && - isNothing (lookupMeta "csl" (optMetadata opts)) && - isNothing (lookupMeta "citation-style" - (optMetadata opts)) - then do - jatsCSL <- readDataFile "jats.csl" - let jatsEncoded = makeDataURI - ("application/xml", jatsCSL) - return $ setMeta "csl" jatsEncoded $ optMetadata opts - else return $ optMetadata opts - case lookupMetaString "lang" (optMetadata opts) of - "" -> setTranslations $ Lang "en" "" "US" [] - l -> case parseBCP47 l of + "" -> setTranslations $ Lang "en" Nothing (Just "US") [] [] [] + l -> case parseLang l of Left _ -> report $ InvalidLang l Right l' -> setTranslations l' @@ -257,13 +257,17 @@ convertWithOpts opts = do let sourceToDoc :: [FilePath] -> PandocIO Pandoc sourceToDoc sources' = case reader of - TextReader r - | optFileScope opts || readerNameBase == "json" -> - mconcat <$> mapM (readSource >=> r readerOpts) sources' - | otherwise -> - readSources sources' >>= r readerOpts - ByteStringReader r -> - mconcat <$> mapM (readFile' >=> r readerOpts) sources' + TextReader r + | readerNameBase == "json" -> + mconcat <$> mapM (readSource >=> r readerOpts) sources' + | optFileScope opts -> + -- Read source and convert tabs (see #6709) + let readSource' = fmap convertTabs . readSource + in mconcat <$> mapM (readSource' >=> r readerOpts) sources' + | otherwise -> + readSources sources' >>= r readerOpts + ByteStringReader r -> + mconcat <$> mapM (readFile' >=> r readerOpts) sources' when (readerNameBase == "markdown_github" || @@ -281,12 +285,21 @@ convertWithOpts opts = do report $ Deprecated "pandoc-citeproc filter" "Use --citeproc instead." + let cslMetadata = + maybe id (setMeta "csl") (optCSL opts) . + (case optBibliography opts of + [] -> id + xs -> setMeta "bibliography" xs) . + maybe id (setMeta "citation-abbreviations") + (optCitationAbbreviations opts) $ mempty + doc <- sourceToDoc sources >>= ( (if isJust (optExtractMedia opts) then fillMediaBag else return) >=> return . adjustMetadata (metadataFromFile <>) - >=> return . adjustMetadata (<> metadata) + >=> return . adjustMetadata (<> optMetadata opts) + >=> return . adjustMetadata (<> cslMetadata) >=> applyTransforms transforms >=> applyFilters readerOpts filters [T.unpack format] >=> maybe return extractMedia (optExtractMedia opts) @@ -353,7 +366,18 @@ readSource src = case parseURI src of _ -> PandocAppError (tshow e)) readURI :: FilePath -> PandocIO Text -readURI src = UTF8.toText . 
fst <$> openURL (T.pack src) +readURI src = do + (bs, mt) <- openURL (T.pack src) + case mt >>= getCharset of + Just "UTF-8" -> return $ UTF8.toText bs + Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs + Just charset -> throwError $ PandocUnsupportedCharsetError charset + Nothing -> liftIO $ -- try first as UTF-8, then as latin1 + E.catch (return $! UTF8.toText bs) + (\case + TSE.DecodeError{} -> + return $ T.pack $ B8.unpack bs + e -> E.throwIO e) readFile' :: MonadIO m => FilePath -> m BL.ByteString readFile' "-" = liftIO BL.getContents @@ -364,6 +388,5 @@ writeFnBinary "-" = liftIO . BL.putStr writeFnBinary f = liftIO . BL.writeFile (UTF8.encodePath f) writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m () --- TODO this implementation isn't maximally efficient: -writerFn eol "-" = liftIO . UTF8.putStrWith eol . T.unpack -writerFn eol f = liftIO . UTF8.writeFileWith eol f . T.unpack +writerFn eol "-" = liftIO . UTF8.putStrWith eol +writerFn eol f = liftIO . UTF8.writeFileWith eol f diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs index 906fcc4c0..a6df12715 100644 --- a/src/Text/Pandoc/App/CommandLineOptions.hs +++ b/src/Text/Pandoc/App/CommandLineOptions.hs @@ -6,7 +6,7 @@ {-# LANGUAGE FlexibleContexts #-} {- | Module : Text.Pandoc.App.CommandLineOptions - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> @@ -17,6 +17,7 @@ Does a pandoc conversion based on command-line options. -} module Text.Pandoc.App.CommandLineOptions ( parseOptions + , parseOptionsFromArgs , options , engines , lookupHighlightStyle @@ -25,11 +26,12 @@ module Text.Pandoc.App.CommandLineOptions ( import Control.Monad import Control.Monad.Trans import Control.Monad.Except (throwError) +import Control.Monad.State.Strict import Data.Aeson.Encode.Pretty (encodePretty', Config(..), keyOrder, defConfig, Indent(..), NumberFormat(..)) import Data.Bifunctor (second) import Data.Char (toLower) -import Data.List (intercalate, sort) +import Data.List (intercalate, sort, foldl') #ifdef _WINDOWS #if MIN_VERSION_base(4,12,0) import Data.List (isPrefixOf) @@ -46,10 +48,13 @@ import System.FilePath import System.IO (stdout) import Text.DocTemplates (Context (..), ToContext (toVal), Val (..)) import Text.Pandoc -import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), IpynbOutput (..), addMeta) +import Text.Pandoc.Builder (setMeta) +import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), IpynbOutput (..), + DefaultsState (..), applyDefaults, + fullDefaultsPath) import Text.Pandoc.Filter (Filter (..)) import Text.Pandoc.Highlighting (highlightingStyles) -import Text.Pandoc.Shared (ordNub, elemText, safeStrRead, defaultUserDataDirs, findM) +import Text.Pandoc.Shared (ordNub, elemText, safeStrRead, defaultUserDataDir) import Text.Printf #ifdef EMBED_DATA_FILES @@ -64,16 +69,19 @@ import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import qualified Data.Map as M import qualified Data.Text as T -import qualified Data.YAML as Y import qualified Text.Pandoc.UTF8 as UTF8 parseOptions :: [OptDescr (Opt -> IO Opt)] -> Opt -> IO Opt parseOptions options' defaults = do rawArgs <- map UTF8.decodeArg <$> getArgs prg <- getProgName + parseOptionsFromArgs options' defaults prg rawArgs +parseOptionsFromArgs + :: [OptDescr (Opt -> IO Opt)] -> Opt -> String -> [String] -> IO Opt +parseOptionsFromArgs options' defaults prg 
rawArgs = do let (actions, args, unrecognizedOpts, errors) = - getOpt' Permute options' rawArgs + getOpt' Permute options' (map UTF8.decodeArg rawArgs) let unknownOptionErrors = foldr (handleUnrecognizedOption . takeWhile (/= '=')) [] @@ -85,7 +93,7 @@ parseOptions options' defaults = do ("Try " ++ prg ++ " --help for more information.") -- thread option data structure through all supplied option actions - opts <- foldl (>>=) (return defaults) actions + opts <- foldl' (>>=) (return defaults) actions let mbArgs = case args of [] -> Nothing xs -> Just xs @@ -166,7 +174,11 @@ options = , Option "d" ["defaults"] (ReqArg - (\arg opt -> applyDefaults opt arg + (\arg opt -> runIOorExplode $ do + let defsState = DefaultsState { curDefaults = Nothing, + inheritanceGraph = [] } + fp <- fullDefaultsPath (optDataDir opt) arg + evalStateT (applyDefaults opt fp) defsState ) "FILE") "" @@ -276,7 +288,8 @@ options = , Option "" ["resource-path"] (ReqArg (\arg opt -> return opt { optResourcePath = - splitSearchPath arg }) + splitSearchPath arg ++ + optResourcePath opt }) "SEARCHPATH") "" -- "Paths to search for images and other resources" @@ -801,10 +814,10 @@ options = map (\c -> ['-',c]) shorts ++ map ("--" ++) longs let allopts = unwords (concatMap optnames options) - UTF8.hPutStrLn stdout $ printf tpl allopts - (unwords readersNames) - (unwords writersNames) - (unwords $ map (T.unpack . fst) highlightingStyles) + UTF8.hPutStrLn stdout $ T.pack $ printf tpl allopts + (T.unpack $ T.unwords readersNames) + (T.unpack $ T.unwords writersNames) + (T.unpack $ T.unwords $ map fst highlightingStyles) (unwords datafiles) exitSuccess )) "" -- "Print bash completion script" @@ -843,7 +856,7 @@ options = else if extensionEnabled x allExts then '-' else ' ') : drop 4 (show x) - mapM_ (UTF8.hPutStrLn stdout . showExt) + mapM_ (UTF8.hPutStrLn stdout . T.pack . showExt) [ex | ex <- extList, extensionEnabled ex allExts] exitSuccess ) "FORMAT") @@ -857,14 +870,14 @@ options = , sShortname s `notElem` [T.pack "Alert", T.pack "Alert_indent"] ] - mapM_ (UTF8.hPutStrLn stdout) (sort langs) + mapM_ (UTF8.hPutStrLn stdout . T.pack) (sort langs) exitSuccess )) "" , Option "" ["list-highlight-styles"] (NoArg (\_ -> do - mapM_ (UTF8.hPutStrLn stdout . T.unpack . fst) highlightingStyles + mapM_ (UTF8.hPutStrLn stdout . fst) highlightingStyles exitSuccess )) "" @@ -882,7 +895,7 @@ options = | T.null t -> -- e.g. for docx, odt, json: E.throwIO $ PandocCouldNotFindDataFileError $ T.pack ("templates/default." ++ arg) - | otherwise -> write . 
T.unpack $ t + | otherwise -> write t Left e -> E.throwIO e exitSuccess) "FORMAT") @@ -928,12 +941,13 @@ options = (NoArg (\_ -> do prg <- getProgName - defaultDatadirs <- defaultUserDataDirs - UTF8.hPutStrLn stdout (prg ++ " " ++ T.unpack pandocVersion ++ - compileInfo ++ - "\nUser data directory: " ++ - intercalate " or " defaultDatadirs ++ - ('\n':copyrightMessage)) + defaultDatadir <- defaultUserDataDir + UTF8.hPutStrLn stdout + $ T.pack + $ prg ++ " " ++ T.unpack pandocVersion ++ + compileInfo ++ + "\nUser data directory: " ++ defaultDatadir ++ + ('\n':copyrightMessage) exitSuccess )) "" -- "Print version" @@ -941,7 +955,7 @@ options = (NoArg (\_ -> do prg <- getProgName - UTF8.hPutStr stdout (usageMessage prg options) + UTF8.hPutStr stdout (T.pack $ usageMessage prg options) exitSuccess )) "" -- "Show help" ] @@ -962,7 +976,7 @@ usageMessage programName = usageInfo (programName ++ " [OPTIONS] [FILES]") copyrightMessage :: String copyrightMessage = intercalate "\n" [ - "Copyright (C) 2006-2020 John MacFarlane. Web: https://pandoc.org", + "Copyright (C) 2006-2021 John MacFarlane. Web: https://pandoc.org", "This is free software; see the source for copying conditions. There is no", "warranty, not even for merchantability or fitness for a particular purpose." ] @@ -1002,38 +1016,16 @@ handleUnrecognizedOption "-R" = handleUnrecognizedOption "--parse-raw" handleUnrecognizedOption x = (("Unknown option " ++ x ++ ".") :) -readersNames :: [String] -readersNames = sort (map (T.unpack . fst) (readers :: [(Text, Reader PandocIO)])) +readersNames :: [Text] +readersNames = sort (map fst (readers :: [(Text, Reader PandocIO)])) -writersNames :: [String] +writersNames :: [Text] writersNames = sort - ("pdf" : map (T.unpack . fst) (writers :: [(Text, Writer PandocIO)])) + ("pdf" : map fst (writers :: [(Text, Writer PandocIO)])) splitField :: String -> (String, String) splitField = second (tailDef "true") . break (`elemText` ":=") --- | Apply defaults from --defaults file. 
-applyDefaults :: Opt -> FilePath -> IO Opt -applyDefaults opt file = runIOorExplode $ do - let fp = if null (takeExtension file) - then addExtension file "yaml" - else file - setVerbosity $ optVerbosity opt - dataDirs <- liftIO defaultUserDataDirs - let fps = fp : case optDataDir opt of - Nothing -> map (</> ("defaults" </> fp)) - dataDirs - Just dd -> [dd </> "defaults" </> fp] - fp' <- fromMaybe fp <$> findM fileExists fps - inp <- readFileLazy fp' - case Y.decode1 inp of - Right (f :: Opt -> Opt) -> return $ f opt - Left (errpos, errmsg) -> throwError $ - PandocParseError $ T.pack $ - "Error parsing " ++ fp' ++ " line " ++ - show (Y.posLine errpos) ++ " column " ++ - show (Y.posColumn errpos) ++ ":\n" ++ errmsg - lookupHighlightStyle :: PandocMonad m => String -> m Style lookupHighlightStyle s | takeExtension s == ".theme" = -- attempt to load KDE theme @@ -1062,6 +1054,27 @@ setVariable key val (Context ctx) = Context $ M.alter go key ctx go (Just (ListVal xs)) = Just $ ListVal $ xs ++ [toVal val] go (Just x) = Just $ ListVal [x, toVal val] +addMeta :: String -> String -> Meta -> Meta +addMeta k v meta = + case lookupMeta k' meta of + Nothing -> setMeta k' v' meta + Just (MetaList xs) -> + setMeta k' (MetaList (xs ++ [v'])) meta + Just x -> setMeta k' (MetaList [x, v']) meta + where + v' = readMetaValue v + k' = T.pack k + +readMetaValue :: String -> MetaValue +readMetaValue s + | s == "true" = MetaBool True + | s == "True" = MetaBool True + | s == "TRUE" = MetaBool True + | s == "false" = MetaBool False + | s == "False" = MetaBool False + | s == "FALSE" = MetaBool False + | otherwise = MetaString $ T.pack s + -- On Windows with ghc 8.6+, we need to rewrite paths -- beginning with \\ to \\?\UNC\. -- See #5127. normalizePath :: FilePath -> FilePath diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs index 155b7e586..bdf8c6667 100644 --- a/src/Text/Pandoc/App/FormatHeuristics.hs +++ b/src/Text/Pandoc/App/FormatHeuristics.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.App.FormatHeuristics - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> @@ -15,18 +15,24 @@ module Text.Pandoc.App.FormatHeuristics ) where import Data.Char (toLower) +import Data.Foldable (asum) import Data.Text (Text) import System.FilePath (takeExtension) --- Determine default format based on file extensions. +-- | Determines default format based on file extensions; uses the format +-- of the first extension that's associated with a format. +-- +-- Examples: +-- +-- > formatFromFilePaths ["text.unknown", "no-extension"] +-- Nothing +-- +-- > formatFromFilePaths ["my.md", "other.rst"] +-- Just "markdown" formatFromFilePaths :: [FilePath] -> Maybe Text -formatFromFilePaths [] = Nothing -formatFromFilePaths (x:xs) = - case formatFromFilePath x of - Just f -> Just f - Nothing -> formatFromFilePaths xs +formatFromFilePaths = asum . map formatFromFilePath --- Determine format based on file extension +-- | Determines format based on file extension. 
formatFromFilePath :: FilePath -> Maybe Text formatFromFilePath x = case takeExtension (map toLower x) of @@ -48,6 +54,11 @@ formatFromFilePath x = ".lhs" -> Just "markdown+lhs" ".ltx" -> Just "latex" ".markdown" -> Just "markdown" + ".mkdn" -> Just "markdown" + ".mkd" -> Just "markdown" + ".mdwn" -> Just "markdown" + ".mdown" -> Just "markdown" + ".Rmd" -> Just "markdown" ".md" -> Just "markdown" ".ms" -> Just "ms" ".muse" -> Just "muse" diff --git a/src/Text/Pandoc/App/Opt.hs b/src/Text/Pandoc/App/Opt.hs index 00b4b5523..d54d932b7 100644 --- a/src/Text/Pandoc/App/Opt.hs +++ b/src/Text/Pandoc/App/Opt.hs @@ -7,7 +7,7 @@ {-# LANGUAGE FlexibleContexts #-} {- | Module : Text.Pandoc.App.Opt - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> @@ -20,21 +20,31 @@ module Text.Pandoc.App.Opt ( Opt(..) , LineEnding (..) , IpynbOutput (..) + , DefaultsState (..) , defaultOpts - , addMeta + , applyDefaults + , fullDefaultsPath ) where +import Control.Monad.Except (MonadIO, liftIO, throwError, (>=>), foldM) +import Control.Monad.State.Strict (StateT, modify, gets) +import System.FilePath ( addExtension, (</>), takeExtension, takeDirectory ) +import System.Directory ( canonicalizePath ) import Data.Char (isLower, toLower) +import Data.Maybe (fromMaybe) import GHC.Generics hiding (Meta) -import Text.Pandoc.Builder (setMeta) import Text.Pandoc.Filter (Filter (..)) -import Text.Pandoc.Logging (Verbosity (WARNING)) +import Text.Pandoc.Logging (Verbosity (WARNING), LogMessage(..)) import Text.Pandoc.Options (TopLevelDivision (TopLevelDefault), TrackChanges (AcceptChanges), WrapOption (WrapAuto), HTMLMathMethod (PlainMath), ReferenceLocation (EndOfDocument), ObfuscationMethod (NoObfuscation), CiteMethod (Citeproc)) -import Text.Pandoc.Shared (camelCaseStrToHyphenated) +import Text.Pandoc.Class (readFileLazy, fileExists, setVerbosity, report, + PandocMonad(lookupEnv), getUserDataDir) +import Text.Pandoc.Error (PandocError (PandocParseError, PandocSomeError)) +import Text.Pandoc.Shared (camelCaseStrToHyphenated, defaultUserDataDir, + findM, ordNub) import qualified Text.Pandoc.Parsing as P import Text.Pandoc.Readers.Metadata (yamlMap) import Text.Pandoc.Class.PandocPure @@ -43,7 +53,7 @@ import Data.Text (Text, unpack) import Data.Default (def) import qualified Data.Text as T import qualified Data.Map as M -import Text.Pandoc.Definition (Meta(..), MetaValue(..), lookupMeta) +import Text.Pandoc.Definition (Meta(..), MetaValue(..)) import Data.Aeson (defaultOptions, Options(..)) import Data.Aeson.TH (deriveJSON) import Control.Applicative ((<|>)) @@ -147,19 +157,194 @@ data Opt = Opt , optNoCheckCertificate :: Bool -- ^ Disable certificate validation , optEol :: LineEnding -- ^ Style of line-endings to use , optStripComments :: Bool -- ^ Skip HTML comments + , optCSL :: Maybe FilePath -- ^ CSL stylesheet + , optBibliography :: [FilePath] -- ^ Bibliography files + , optCitationAbbreviations :: Maybe FilePath -- ^ Citation abbreviations } deriving (Generic, Show) instance FromYAML (Opt -> Opt) where - parseYAML (Mapping _ _ m) = - foldr (.) 
id <$> mapM doOpt (M.toList m) + parseYAML (Mapping _ _ m) = chain doOpt (M.toList m) parseYAML n = failAtNode n "Expected a mapping" +data DefaultsState = DefaultsState + { + curDefaults :: Maybe FilePath -- currently parsed file + , inheritanceGraph :: [[FilePath]] -- defaults file inheritance graph + } deriving (Show) + +instance (PandocMonad m, MonadIO m) + => FromYAML (Opt -> StateT DefaultsState m Opt) where + parseYAML (Mapping _ _ m) = do + let opts = M.mapKeys toText m + dataDir <- case M.lookup "data-dir" opts of + Nothing -> return Nothing + Just v -> Just . unpack <$> parseYAML v + f <- parseOptions (M.toList m) + case M.lookup "defaults" opts of + Just v -> do + g <- parseDefaults v dataDir + return $ g >=> f >=> resolveVarsInOpt + Nothing -> return $ f >=> resolveVarsInOpt + where + toText (Scalar _ (SStr s)) = s + toText _ = "" + parseYAML n = failAtNode n "Expected a mapping" + +resolveVarsInOpt :: forall m. (PandocMonad m, MonadIO m) + => Opt -> StateT DefaultsState m Opt +resolveVarsInOpt + opt@Opt + { optTemplate = oTemplate + , optMetadataFiles = oMetadataFiles + , optOutputFile = oOutputFile + , optInputFiles = oInputFiles + , optSyntaxDefinitions = oSyntaxDefinitions + , optAbbreviations = oAbbreviations + , optReferenceDoc = oReferenceDoc + , optEpubMetadata = oEpubMetadata + , optEpubFonts = oEpubFonts + , optEpubCoverImage = oEpubCoverImage + , optLogFile = oLogFile + , optFilters = oFilters + , optDataDir = oDataDir + , optExtractMedia = oExtractMedia + , optCss = oCss + , optIncludeBeforeBody = oIncludeBeforeBody + , optIncludeAfterBody = oIncludeAfterBody + , optIncludeInHeader = oIncludeInHeader + , optResourcePath = oResourcePath + , optCSL = oCSL + , optBibliography = oBibliography + , optCitationAbbreviations = oCitationAbbreviations + } + = do + oTemplate' <- mapM resolveVars oTemplate + oMetadataFiles' <- mapM resolveVars oMetadataFiles + oOutputFile' <- mapM resolveVars oOutputFile + oInputFiles' <- mapM (mapM resolveVars) oInputFiles + oSyntaxDefinitions' <- mapM resolveVars oSyntaxDefinitions + oAbbreviations' <- mapM resolveVars oAbbreviations + oReferenceDoc' <- mapM resolveVars oReferenceDoc + oEpubMetadata' <- mapM resolveVars oEpubMetadata + oEpubFonts' <- mapM resolveVars oEpubFonts + oEpubCoverImage' <- mapM resolveVars oEpubCoverImage + oLogFile' <- mapM resolveVars oLogFile + oFilters' <- mapM resolveVarsInFilter oFilters + oDataDir' <- mapM resolveVars oDataDir + oExtractMedia' <- mapM resolveVars oExtractMedia + oCss' <- mapM resolveVars oCss + oIncludeBeforeBody' <- mapM resolveVars oIncludeBeforeBody + oIncludeAfterBody' <- mapM resolveVars oIncludeAfterBody + oIncludeInHeader' <- mapM resolveVars oIncludeInHeader + oResourcePath' <- mapM resolveVars oResourcePath + oCSL' <- mapM resolveVars oCSL + oBibliography' <- mapM resolveVars oBibliography + oCitationAbbreviations' <- mapM resolveVars oCitationAbbreviations + return opt{ optTemplate = oTemplate' + , optMetadataFiles = oMetadataFiles' + , optOutputFile = oOutputFile' + , optInputFiles = oInputFiles' + , optSyntaxDefinitions = oSyntaxDefinitions' + , optAbbreviations = oAbbreviations' + , optReferenceDoc = oReferenceDoc' + , optEpubMetadata = oEpubMetadata' + , optEpubFonts = oEpubFonts' + , optEpubCoverImage = oEpubCoverImage' + , optLogFile = oLogFile' + , optFilters = oFilters' + , optDataDir = oDataDir' + , optExtractMedia = oExtractMedia' + , optCss = oCss' + , optIncludeBeforeBody = oIncludeBeforeBody' + , optIncludeAfterBody = oIncludeAfterBody' + , optIncludeInHeader = 
oIncludeInHeader' + , optResourcePath = oResourcePath' + , optCSL = oCSL' + , optBibliography = oBibliography' + , optCitationAbbreviations = oCitationAbbreviations' + } + + where + resolveVars :: FilePath -> StateT DefaultsState m FilePath + resolveVars [] = return [] + resolveVars ('$':'{':xs) = + let (ys, zs) = break (=='}') xs + in if null zs + then return $ '$':'{':xs + else do + val <- lookupEnv' ys + (val ++) <$> resolveVars (drop 1 zs) + resolveVars (c:cs) = (c:) <$> resolveVars cs + lookupEnv' :: String -> StateT DefaultsState m String + lookupEnv' "." = do + mbCurDefaults <- gets curDefaults + maybe (return "") + (fmap takeDirectory . liftIO . canonicalizePath) + mbCurDefaults + lookupEnv' "USERDATA" = do + mbodatadir <- mapM resolveVars oDataDir + mbdatadir <- getUserDataDir + defdatadir <- liftIO defaultUserDataDir + return $ fromMaybe defdatadir (mbodatadir <|> mbdatadir) + lookupEnv' v = do + mbval <- fmap T.unpack <$> lookupEnv (T.pack v) + case mbval of + Nothing -> do + report $ EnvironmentVariableUndefined (T.pack v) + return mempty + Just x -> return x + resolveVarsInFilter (JSONFilter fp) = + JSONFilter <$> resolveVars fp + resolveVarsInFilter (LuaFilter fp) = + LuaFilter <$> resolveVars fp + resolveVarsInFilter CiteprocFilter = return CiteprocFilter + + + +parseDefaults :: (PandocMonad m, MonadIO m) + => Node Pos + -> Maybe FilePath + -> Parser (Opt -> StateT DefaultsState m Opt) +parseDefaults n dataDir = parseDefsNames n >>= \ds -> return $ \o -> do + -- get parent defaults: + defsParent <- gets $ fromMaybe "" . curDefaults + -- get child defaults: + defsChildren <- mapM (fullDefaultsPath dataDir) ds + -- expand parent in defaults inheritance graph by children: + defsGraph <- gets inheritanceGraph + let defsGraphExp = expand defsGraph defsChildren defsParent + modify $ \defsState -> defsState{ inheritanceGraph = defsGraphExp } + -- check for cyclic inheritance: + if cyclic defsGraphExp + then throwError $ + PandocSomeError $ T.pack $ + "Error: Circular defaults file reference in " ++ + "'" ++ defsParent ++ "'" + else foldM applyDefaults o defsChildren + where parseDefsNames x = (parseYAML x >>= \xs -> return $ map unpack xs) + <|> (parseYAML x >>= \x' -> return [unpack x']) + +parseOptions :: Monad m + => [(Node Pos, Node Pos)] + -> Parser (Opt -> StateT DefaultsState m Opt) +parseOptions ns = do + f <- chain doOpt' ns + return $ return . f + +chain :: Monad m => (a -> m (b -> b)) -> [a] -> m (b -> b) +chain f = foldM g id + where g o n = f n >>= \o' -> return $ o' . 
o + +doOpt' :: (Node Pos, Node Pos) -> Parser (Opt -> Opt) +doOpt' (k',v) = do + k <- parseStringKey k' + case k of + "defaults" -> return id + _ -> doOpt (k',v) + doOpt :: (Node Pos, Node Pos) -> Parser (Opt -> Opt) doOpt (k',v) = do - k <- case k' of - Scalar _ (SStr t) -> return t - Scalar _ _ -> failAtNode k' "Non-string key" - _ -> failAtNode k' "Non-scalar key" + k <- parseStringKey k' case k of "tab-stop" -> parseYAML v >>= \x -> return (\o -> o{ optTabStop = x }) @@ -358,26 +543,18 @@ doOpt (k',v) = do (parseYAML v >>= \x -> return (\o -> o{ optCss = optCss o <> [unpack x] })) "bibliography" -> - do let addItem x o = o{ optMetadata = - addMeta "bibliography" (T.unpack x) - (optMetadata o) } - (parseYAML v >>= \(xs :: [Text]) -> return $ \o -> - foldr addItem o xs) - <|> - (parseYAML v >>= \(x :: Text) -> return $ \o -> addItem x o) + (parseYAML v >>= \x -> return (\o -> + o{ optBibliography = optBibliography o <> + map unpack x })) + <|> + (parseYAML v >>= \x -> return (\o -> + o{ optBibliography = optBibliography o <> + [unpack x] })) "csl" -> - do let addItem x o = o{ optMetadata = - addMeta "csl" (T.unpack x) - (optMetadata o) } - (parseYAML v >>= \(xs :: [Text]) -> return $ \o -> - foldr addItem o xs) - <|> - (parseYAML v >>= \(x :: Text) -> return $ \o -> addItem x o) + parseYAML v >>= \x -> return (\o -> o{ optCSL = unpack <$> x }) "citation-abbreviations" -> - parseYAML v >>= \x -> - return (\o -> o{ optMetadata = - addMeta "citation-abbreviations" (T.unpack x) - (optMetadata o) }) + parseYAML v >>= \x -> return (\o -> o{ optCitationAbbreviations = + unpack <$> x }) "ipynb-output" -> parseYAML v >>= \x -> return (\o -> o{ optIpynbOutput = x }) "include-before-body" -> @@ -406,7 +583,8 @@ doOpt (k',v) = do optIncludeInHeader o <> [unpack x] })) "resource-path" -> parseYAML v >>= \x -> - return (\o -> o{ optResourcePath = map unpack x }) + return (\o -> o{ optResourcePath = map unpack x <> + optResourcePath o }) "request-headers" -> parseYAML v >>= \x -> return (\o -> o{ optRequestHeaders = x }) @@ -492,38 +670,70 @@ defaultOpts = Opt , optNoCheckCertificate = False , optEol = Native , optStripComments = False + , optCSL = Nothing + , optBibliography = [] + , optCitationAbbreviations = Nothing } +parseStringKey :: Node Pos -> Parser Text +parseStringKey k = case k of + Scalar _ (SStr t) -> return t + Scalar _ _ -> failAtNode k "Non-string key" + _ -> failAtNode k "Non-scalar key" + yamlToMeta :: Node Pos -> Parser Meta yamlToMeta (Mapping _ _ m) = either (fail . show) return $ runEverything (yamlMap pMetaString m) where pMetaString = pure . MetaString <$> P.manyChar P.anyChar - runEverything p = runPure (P.readWithM p def "") + runEverything p = + runPure (P.readWithM p (def :: P.ParserState) ("" :: Text)) >>= fmap (Meta . flip P.runF def) yamlToMeta _ = return mempty -addMeta :: String -> String -> Meta -> Meta -addMeta k v meta = - case lookupMeta k' meta of - Nothing -> setMeta k' v' meta - Just (MetaList xs) -> - setMeta k' (MetaList (xs ++ [v'])) meta - Just x -> setMeta k' (MetaList [x, v']) meta - where - v' = readMetaValue v - k' = T.pack k +-- | Apply defaults from --defaults file. 
+applyDefaults :: (PandocMonad m, MonadIO m) + => Opt + -> FilePath + -> StateT DefaultsState m Opt +applyDefaults opt file = do + setVerbosity $ optVerbosity opt + modify $ \defsState -> defsState{ curDefaults = Just file } + inp <- readFileLazy file + case decode1 inp of + Right f -> f opt + Left (errpos, errmsg) -> throwError $ + PandocParseError $ T.pack $ + "Error parsing " ++ file ++ " line " ++ + show (posLine errpos) ++ " column " ++ + show (posColumn errpos) ++ ":\n" ++ errmsg -readMetaValue :: String -> MetaValue -readMetaValue s - | s == "true" = MetaBool True - | s == "True" = MetaBool True - | s == "TRUE" = MetaBool True - | s == "false" = MetaBool False - | s == "False" = MetaBool False - | s == "FALSE" = MetaBool False - | otherwise = MetaString $ T.pack s +fullDefaultsPath :: (PandocMonad m, MonadIO m) + => Maybe FilePath + -> FilePath + -> m FilePath +fullDefaultsPath dataDir file = do + let fp = if null (takeExtension file) + then addExtension file "yaml" + else file + defaultDataDir <- liftIO defaultUserDataDir + let defaultFp = fromMaybe defaultDataDir dataDir </> "defaults" </> fp + fromMaybe fp <$> findM fileExists [fp, defaultFp] +-- | In a list of lists, append another list in front of every list which +-- starts with specific element. +expand :: Ord a => [[a]] -> [a] -> a -> [[a]] +expand [] ns n = fmap (\x -> x : [n]) ns +expand ps ns n = concatMap (ext n ns) ps + where + ext x xs p = case p of + (l : _) | x == l -> fmap (: p) xs + _ -> [p] + +cyclic :: Ord a => [[a]] -> Bool +cyclic = any hasDuplicate + where + hasDuplicate xs = length (ordNub xs) /= length xs -- see https://github.com/jgm/pandoc/pull/4083 -- using generic deriving caused long compilation times diff --git a/src/Text/Pandoc/App/OutputSettings.hs b/src/Text/Pandoc/App/OutputSettings.hs index 139b408cb..3864ab188 100644 --- a/src/Text/Pandoc/App/OutputSettings.hs +++ b/src/Text/Pandoc/App/OutputSettings.hs @@ -5,7 +5,7 @@ {-# LANGUAGE TupleSections #-} {- | Module : Text.Pandoc.App - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> @@ -59,8 +59,8 @@ optToOutputSettings opts = do let outputFile = fromMaybe "-" (optOutputFile opts) when (optDumpArgs opts) . liftIO $ do - UTF8.hPutStrLn stdout outputFile - mapM_ (UTF8.hPutStrLn stdout) (fromMaybe [] $ optInputFiles opts) + UTF8.hPutStrLn stdout (T.pack outputFile) + mapM_ (UTF8.hPutStrLn stdout . T.pack) (fromMaybe [] $ optInputFiles opts) exitSuccess epubMetadata <- traverse readUtf8File $ optEpubMetadata opts diff --git a/src/Text/Pandoc/Asciify.hs b/src/Text/Pandoc/Asciify.hs index 9e9cc8d9b..620546c13 100644 --- a/src/Text/Pandoc/Asciify.hs +++ b/src/Text/Pandoc/Asciify.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Asciify - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -10,396 +10,19 @@ Function to convert accented latin letters to their unaccented ascii equivalents (used in constructing HTML identifiers). 
-} -module Text.Pandoc.Asciify (toAsciiChar) +module Text.Pandoc.Asciify (toAsciiChar, toAsciiText) where -import Data.Char (isAscii) -import qualified Data.Map as M +import Data.Char (isAscii, isMark) +import qualified Data.Text.Normalize as TN +import Data.Text (Text) +import qualified Data.Text as T -toAsciiChar :: Char -> Maybe Char -toAsciiChar c | isAscii c = Just c - | otherwise = M.lookup c asciiMap +toAsciiText :: Text -> Text +toAsciiText = T.filter isAscii . TN.normalize (TN.NFD) -asciiMap :: M.Map Char Char -asciiMap = M.fromList - [('\192','A') - ,('\193','A') - ,('\194','A') - ,('\195','A') - ,('\196','A') - ,('\197','A') - ,('\199','C') - ,('\200','E') - ,('\201','E') - ,('\202','E') - ,('\203','E') - ,('\204','I') - ,('\205','I') - ,('\206','I') - ,('\207','I') - ,('\209','N') - ,('\210','O') - ,('\211','O') - ,('\212','O') - ,('\213','O') - ,('\214','O') - ,('\217','U') - ,('\218','U') - ,('\219','U') - ,('\220','U') - ,('\221','Y') - ,('\224','a') - ,('\225','a') - ,('\226','a') - ,('\227','a') - ,('\228','a') - ,('\229','a') - ,('\231','c') - ,('\232','e') - ,('\233','e') - ,('\234','e') - ,('\235','e') - ,('\236','i') - ,('\237','i') - ,('\238','i') - ,('\239','i') - ,('\241','n') - ,('\242','o') - ,('\243','o') - ,('\244','o') - ,('\245','o') - ,('\246','o') - ,('\249','u') - ,('\250','u') - ,('\251','u') - ,('\252','u') - ,('\253','y') - ,('\255','y') - ,('\256','A') - ,('\257','a') - ,('\258','A') - ,('\259','a') - ,('\260','A') - ,('\261','a') - ,('\262','C') - ,('\263','c') - ,('\264','C') - ,('\265','c') - ,('\266','C') - ,('\267','c') - ,('\268','C') - ,('\269','c') - ,('\270','D') - ,('\271','d') - ,('\274','E') - ,('\275','e') - ,('\276','E') - ,('\277','e') - ,('\278','E') - ,('\279','e') - ,('\280','E') - ,('\281','e') - ,('\282','E') - ,('\283','e') - ,('\284','G') - ,('\285','g') - ,('\286','G') - ,('\287','g') - ,('\288','G') - ,('\289','g') - ,('\290','G') - ,('\291','g') - ,('\292','H') - ,('\293','h') - ,('\296','I') - ,('\297','i') - ,('\298','I') - ,('\299','i') - ,('\300','I') - ,('\301','i') - ,('\302','I') - ,('\303','i') - ,('\304','I') - ,('\305','i') - ,('\308','J') - ,('\309','j') - ,('\310','K') - ,('\311','k') - ,('\313','L') - ,('\314','l') - ,('\315','L') - ,('\316','l') - ,('\317','L') - ,('\318','l') - ,('\323','N') - ,('\324','n') - ,('\325','N') - ,('\326','n') - ,('\327','N') - ,('\328','n') - ,('\332','O') - ,('\333','o') - ,('\334','O') - ,('\335','o') - ,('\336','O') - ,('\337','o') - ,('\340','R') - ,('\341','r') - ,('\342','R') - ,('\343','r') - ,('\344','R') - ,('\345','r') - ,('\346','S') - ,('\347','s') - ,('\348','S') - ,('\349','s') - ,('\350','S') - ,('\351','s') - ,('\352','S') - ,('\353','s') - ,('\354','T') - ,('\355','t') - ,('\356','T') - ,('\357','t') - ,('\360','U') - ,('\361','u') - ,('\362','U') - ,('\363','u') - ,('\364','U') - ,('\365','u') - ,('\366','U') - ,('\367','u') - ,('\368','U') - ,('\369','u') - ,('\370','U') - ,('\371','u') - ,('\372','W') - ,('\373','w') - ,('\374','Y') - ,('\375','y') - ,('\376','Y') - ,('\377','Z') - ,('\378','z') - ,('\379','Z') - ,('\380','z') - ,('\381','Z') - ,('\382','z') - ,('\416','O') - ,('\417','o') - ,('\431','U') - ,('\432','u') - ,('\461','A') - ,('\462','a') - ,('\463','I') - ,('\464','i') - ,('\465','O') - ,('\466','o') - ,('\467','U') - ,('\468','u') - ,('\486','G') - ,('\487','g') - ,('\488','K') - ,('\489','k') - ,('\490','O') - ,('\491','o') - ,('\496','j') - ,('\500','G') - ,('\501','g') - ,('\504','N') - ,('\505','n') - ,('\512','A') - ,('\513','a') - 
,('\514','A') - ,('\515','a') - ,('\516','E') - ,('\517','e') - ,('\518','E') - ,('\519','e') - ,('\520','I') - ,('\521','i') - ,('\522','I') - ,('\523','i') - ,('\524','O') - ,('\525','o') - ,('\526','O') - ,('\527','o') - ,('\528','R') - ,('\529','r') - ,('\530','R') - ,('\531','r') - ,('\532','U') - ,('\533','u') - ,('\534','U') - ,('\535','u') - ,('\536','S') - ,('\537','s') - ,('\538','T') - ,('\539','t') - ,('\542','H') - ,('\543','h') - ,('\550','A') - ,('\551','a') - ,('\552','E') - ,('\553','e') - ,('\558','O') - ,('\559','o') - ,('\562','Y') - ,('\563','y') - ,('\894',';') - ,('\7680','A') - ,('\7681','a') - ,('\7682','B') - ,('\7683','b') - ,('\7684','B') - ,('\7685','b') - ,('\7686','B') - ,('\7687','b') - ,('\7690','D') - ,('\7691','d') - ,('\7692','D') - ,('\7693','d') - ,('\7694','D') - ,('\7695','d') - ,('\7696','D') - ,('\7697','d') - ,('\7698','D') - ,('\7699','d') - ,('\7704','E') - ,('\7705','e') - ,('\7706','E') - ,('\7707','e') - ,('\7710','F') - ,('\7711','f') - ,('\7712','G') - ,('\7713','g') - ,('\7714','H') - ,('\7715','h') - ,('\7716','H') - ,('\7717','h') - ,('\7718','H') - ,('\7719','h') - ,('\7720','H') - ,('\7721','h') - ,('\7722','H') - ,('\7723','h') - ,('\7724','I') - ,('\7725','i') - ,('\7728','K') - ,('\7729','k') - ,('\7730','K') - ,('\7731','k') - ,('\7732','K') - ,('\7733','k') - ,('\7734','L') - ,('\7735','l') - ,('\7738','L') - ,('\7739','l') - ,('\7740','L') - ,('\7741','l') - ,('\7742','M') - ,('\7743','m') - ,('\7744','M') - ,('\7745','m') - ,('\7746','M') - ,('\7747','m') - ,('\7748','N') - ,('\7749','n') - ,('\7750','N') - ,('\7751','n') - ,('\7752','N') - ,('\7753','n') - ,('\7754','N') - ,('\7755','n') - ,('\7764','P') - ,('\7765','p') - ,('\7766','P') - ,('\7767','p') - ,('\7768','R') - ,('\7769','r') - ,('\7770','R') - ,('\7771','r') - ,('\7774','R') - ,('\7775','r') - ,('\7776','S') - ,('\7777','s') - ,('\7778','S') - ,('\7779','s') - ,('\7786','T') - ,('\7787','t') - ,('\7788','T') - ,('\7789','t') - ,('\7790','T') - ,('\7791','t') - ,('\7792','T') - ,('\7793','t') - ,('\7794','U') - ,('\7795','u') - ,('\7796','U') - ,('\7797','u') - ,('\7798','U') - ,('\7799','u') - ,('\7804','V') - ,('\7805','v') - ,('\7806','V') - ,('\7807','v') - ,('\7808','W') - ,('\7809','w') - ,('\7810','W') - ,('\7811','w') - ,('\7812','W') - ,('\7813','w') - ,('\7814','W') - ,('\7815','w') - ,('\7816','W') - ,('\7817','w') - ,('\7818','X') - ,('\7819','x') - ,('\7820','X') - ,('\7821','x') - ,('\7822','Y') - ,('\7823','y') - ,('\7824','Z') - ,('\7825','z') - ,('\7826','Z') - ,('\7827','z') - ,('\7828','Z') - ,('\7829','z') - ,('\7830','h') - ,('\7831','t') - ,('\7832','w') - ,('\7833','y') - ,('\7840','A') - ,('\7841','a') - ,('\7842','A') - ,('\7843','a') - ,('\7864','E') - ,('\7865','e') - ,('\7866','E') - ,('\7867','e') - ,('\7868','E') - ,('\7869','e') - ,('\7880','I') - ,('\7881','i') - ,('\7882','I') - ,('\7883','i') - ,('\7884','O') - ,('\7885','o') - ,('\7886','O') - ,('\7887','o') - ,('\7908','U') - ,('\7909','u') - ,('\7910','U') - ,('\7911','u') - ,('\7922','Y') - ,('\7923','y') - ,('\7924','Y') - ,('\7925','y') - ,('\7926','Y') - ,('\7927','y') - ,('\7928','Y') - ,('\7929','y') - ,('\8175','`') - ,('\8490','K') - ,('\8800','=') - ,('\8814','<') - ,('\8815','>') - ] +toAsciiChar :: Char -> Maybe Char +toAsciiChar c = case T.unpack (TN.normalize TN.NFD (T.singleton c)) of + (x:xs) | isAscii x + , all isMark xs + -> Just x + _ -> Nothing diff --git a/src/Text/Pandoc/BCP47.hs b/src/Text/Pandoc/BCP47.hs deleted file mode 100644 index b41e93125..000000000 
--- a/src/Text/Pandoc/BCP47.hs +++ /dev/null @@ -1,112 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -{- | - Module : Text.Pandoc.BCP47 - Copyright : Copyright (C) 2017–2020 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Functions for parsing and rendering BCP47 language identifiers. --} -module Text.Pandoc.BCP47 ( - getLang - , parseBCP47 - , Lang(..) - , renderLang - ) -where -import Control.Monad (guard) -import Data.Char (isAlphaNum, isAscii, isLetter, isLower, isUpper) -import Text.Pandoc.Definition -import Text.Pandoc.Options -import Text.DocTemplates (FromContext(..)) -import qualified Data.Text as T -import qualified Text.Parsec as P - --- | Represents BCP 47 language/country code. -data Lang = Lang{ langLanguage :: T.Text - , langScript :: T.Text - , langRegion :: T.Text - , langVariants :: [T.Text] } - deriving (Eq, Ord, Show) - --- | Render a Lang as BCP 47. -renderLang :: Lang -> T.Text -renderLang lang = T.intercalate "-" (langLanguage lang : filter (not . T.null) - ([langScript lang, langRegion lang] ++ langVariants lang)) - --- | Get the contents of the `lang` metadata field or variable. -getLang :: WriterOptions -> Meta -> Maybe T.Text -getLang opts meta = - case lookupContext "lang" (writerVariables opts) of - Just s -> Just s - _ -> - case lookupMeta "lang" meta of - Just (MetaBlocks [Para [Str s]]) -> Just s - Just (MetaBlocks [Plain [Str s]]) -> Just s - Just (MetaInlines [Str s]) -> Just s - Just (MetaString s) -> Just s - _ -> Nothing - --- | Parse a BCP 47 string as a Lang. Currently we parse --- extensions and private-use fields as "variants," even --- though officially they aren't. -parseBCP47 :: T.Text -> Either T.Text Lang -parseBCP47 lang = - case P.parse bcp47 "lang" lang of - Right r -> Right r - Left e -> Left $ T.pack $ show e - where bcp47 = do - language <- pLanguage - script <- P.option "" pScript - region <- P.option "" pRegion - variants <- P.many (pVariant P.<|> pExtension P.<|> pPrivateUse) - P.eof - return Lang{ langLanguage = language - , langScript = script - , langRegion = region - , langVariants = variants } - asciiLetter = P.satisfy (\c -> isAscii c && isLetter c) - pLanguage = do - cs <- P.many1 asciiLetter - let lcs = length cs - guard $ lcs == 2 || lcs == 3 - return $ T.toLower $ T.pack cs - pScript = P.try $ do - P.char '-' - x <- P.satisfy (\c -> isAscii c && isLetter c && isUpper c) - xs <- P.count 3 - (P.satisfy (\c -> isAscii c && isLetter c && isLower c)) - return $ T.toLower $ T.pack (x:xs) - pRegion = P.try $ do - P.char '-' - cs <- P.many1 asciiLetter - let lcs = length cs - guard $ lcs == 2 || lcs == 3 - return $ T.toUpper $ T.pack cs - pVariant = P.try $ do - P.char '-' - ds <- P.option "" (P.count 1 P.digit) - cs <- P.many1 asciiLetter - let var = ds ++ cs - lv = length var - guard $ if null ds - then lv >= 5 && lv <= 8 - else lv == 4 - return $ T.toLower $ T.pack var - pExtension = P.try $ do - P.char '-' - cs <- P.many1 $ P.satisfy (\c -> isAscii c && isAlphaNum c) - let lcs = length cs - guard $ lcs >= 2 && lcs <= 8 - return $ T.toLower $ T.pack cs - pPrivateUse = P.try $ do - P.char '-' - P.char 'x' - P.char '-' - cs <- P.many1 $ P.satisfy (\c -> isAscii c && isAlphaNum c) - guard $ not (null cs) && length cs <= 8 - let var = "x-" ++ cs - return $ T.toLower $ T.pack var diff --git a/src/Text/Pandoc/CSS.hs b/src/Text/Pandoc/CSS.hs index d98c85147..625feadbb 100644 --- a/src/Text/Pandoc/CSS.hs +++ b/src/Text/Pandoc/CSS.hs 
@@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.CSS -Copyright : © 2006-2020 John MacFarlane <jgm@berkeley.edu>, +Copyright : © 2006-2021 John MacFarlane <jgm@berkeley.edu>, 2015-2016 Mauro Bieg, 2015 Ophir Lifshitz <hangfromthefloor@gmail.com> License : GNU GPL, version 2 or above diff --git a/src/Text/Pandoc/CSV.hs b/src/Text/Pandoc/CSV.hs index 10812644f..2bd21bcfb 100644 --- a/src/Text/Pandoc/CSV.hs +++ b/src/Text/Pandoc/CSV.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.CSV - Copyright : Copyright (C) 2017–2020 John MacFarlane <jgm@berkeley.edu> + Copyright : Copyright (C) 2017-2021 John MacFarlane <jgm@berkeley.edu> License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> Stability : alpha @@ -68,8 +68,7 @@ escaped opts = try $ pCSVUnquotedCell :: CSVOptions -> Parser Text pCSVUnquotedCell opts = T.pack <$> - many (satisfy (\c -> c /= csvDelim opts && c /= '\r' && c /= '\n' - && c /= csvQuote opts)) + many (satisfy (\c -> c /= csvDelim opts && c /= '\r' && c /= '\n')) pCSVDelim :: CSVOptions -> Parser () pCSVDelim opts = do diff --git a/src/Text/Pandoc/Citeproc.hs b/src/Text/Pandoc/Citeproc.hs index a48f97c3b..246f54516 100644 --- a/src/Text/Pandoc/Citeproc.hs +++ b/src/Text/Pandoc/Citeproc.hs @@ -5,7 +5,10 @@ {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} module Text.Pandoc.Citeproc - ( processCitations ) + ( processCitations, + getReferences, + getStyle + ) where import Citeproc @@ -15,38 +18,104 @@ import Text.Pandoc.Citeproc.CslJson (cslJsonToReferences) import Text.Pandoc.Citeproc.BibTeX (readBibtexString, Variant(..)) import Text.Pandoc.Citeproc.MetaValue (metaValueToReference, metaValueToText) import Text.Pandoc.Readers.Markdown (yamlToRefs) -import Text.Pandoc.Class (setResourcePath, getResourcePath, getUserDataDir) -import Data.ByteString (ByteString) -import qualified Data.ByteString.Lazy as L +import Text.Pandoc.Builder (Inlines, Many(..), deleteMeta, setMeta) +import qualified Text.Pandoc.Builder as B import Text.Pandoc.Definition as Pandoc -import Text.Pandoc.Walk -import Text.Pandoc.Builder as B -import Text.Pandoc (PandocMonad(..), PandocError(..), - readDataFile, ReaderOptions(..), pandocExtensions, - report, LogMessage(..), fetchItem) -import Text.Pandoc.Shared (stringify, ordNub, blocksToInlines, tshow) +import Text.Pandoc.Class (PandocMonad(..), getResourcePath, getUserDataDir, + fetchItem, readDataFile, report, setResourcePath) +import Text.Pandoc.Error (PandocError(..)) +import Text.Pandoc.Extensions (pandocExtensions) +import Text.Pandoc.Logging (LogMessage(..)) +import Text.Pandoc.Options (ReaderOptions(..)) +import Text.Pandoc.Shared (stringify, ordNub, tshow) import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Walk (query, walk, walkM) +import Control.Applicative ((<|>)) +import Control.Monad.Except (catchError, throwError) +import Control.Monad.State (State, evalState, get, put, runState) import Data.Aeson (eitherDecode) -import Data.Default -import Data.Ord () +import Data.ByteString (ByteString) +import qualified Data.ByteString.Lazy as L +import Data.Char (isPunctuation, isUpper) +import Data.Default (Default(def)) +import qualified Data.Foldable as Foldable import qualified Data.Map as M +import Data.Maybe (mapMaybe, fromMaybe) +import Data.Ord () +import qualified Data.Sequence as Seq import qualified Data.Set as Set -import Data.Char (isPunctuation, isUpper) import Data.Text (Text) import qualified Data.Text as T -import Control.Monad.State -import qualified Data.Sequence as Seq -import qualified 
Data.Foldable as Foldable -import System.FilePath -import Control.Applicative -import Control.Monad.Except -import Data.Maybe (mapMaybe, fromMaybe) +import System.FilePath (takeExtension) import Safe (lastMay, initSafe) --- import Debug.Trace as Trace (trace, traceShowId) -processCitations :: PandocMonad m => Pandoc -> m Pandoc +processCitations :: PandocMonad m => Pandoc -> m Pandoc processCitations (Pandoc meta bs) = do + style <- getStyle (Pandoc meta bs) + + mblang <- getLang meta + let locale = Citeproc.mergeLocales mblang style + + refs <- getReferences (Just locale) (Pandoc meta bs) + + let otherIdsMap = foldr (\ref m -> + case T.words . extractText <$> + M.lookup "other-ids" + (referenceVariables ref) of + Nothing -> m + Just ids -> foldr + (\id' -> + M.insert id' (referenceId ref)) m ids) + M.empty refs + let meta' = deleteMeta "nocite" meta + let citations = getCitations locale otherIdsMap $ Pandoc meta' bs + + + let linkCites = maybe False truish $ lookupMeta "link-citations" meta + let opts = defaultCiteprocOptions{ linkCitations = linkCites } + let result = Citeproc.citeproc opts style mblang refs citations + mapM_ (report . CiteprocWarning) (resultWarnings result) + let sopts = styleOptions style + let classes = "references" : -- TODO remove this or keep for compatibility? + "csl-bib-body" : + ["hanging-indent" | styleHangingIndent sopts] + let refkvs = (case styleEntrySpacing sopts of + Just es | es > 0 -> (("entry-spacing",T.pack $ show es):) + _ -> id) . + (case styleLineSpacing sopts of + Just ls | ls > 1 -> (("line-spacing",T.pack $ show ls):) + _ -> id) $ [] + let bibs = mconcat $ map (\(ident, out) -> + B.divWith ("ref-" <> ident,["csl-entry"],[]) . B.para . + walk (convertQuotes locale) . + insertSpace $ out) + (resultBibliography result) + let moveNotes = styleIsNoteStyle sopts && + maybe True truish (lookupMeta "notes-after-punctuation" meta) + let cits = map (walk (convertQuotes locale)) $ + resultCitations result + + let metanocites = lookupMeta "nocite" meta + let Pandoc meta'' bs' = + maybe id (setMeta "nocite") metanocites . + walk (mvPunct moveNotes locale) . + (if styleIsNoteStyle sopts + then walk addNote . walk deNote + else id) . + evalState (walkM insertResolvedCitations $ Pandoc meta' bs) + $ cits + return $ Pandoc meta'' + $ insertRefs refkvs classes meta'' + (walk fixLinks $ B.toList bibs) bs' + +-- | Retrieve the CSL style specified by the csl or citation-style +-- metadata field in a pandoc document, or the default CSL style +-- if none is specified. Retrieve the parent style +-- if the style is a dependent style. Add abbreviations defined +-- in an abbreviation file if one has been specified. 
+getStyle :: PandocMonad m => Pandoc -> m (Style Inlines) +getStyle (Pandoc meta _) = do let cslfile = (lookupMeta "csl" meta <|> lookupMeta "citation-style" meta) >>= metaValueToText @@ -87,86 +156,60 @@ processCitations (Pandoc meta bs) = do catchError (getFile ".csl" basename) (\_ -> fst <$> fetchItem url) styleRes <- Citeproc.parseStyle getParentStyle cslContents - style <- - case styleRes of - Left err -> throwError $ PandocAppError $ prettyCiteprocError err - Right style -> return style{ styleAbbreviations = mbAbbrevs } - let mblang = parseLang <$> - ((lookupMeta "lang" meta <|> lookupMeta "locale" meta) >>= metaValueToText) - let locale = Citeproc.mergeLocales mblang style + case styleRes of + Left err -> throwError $ PandocAppError $ prettyCiteprocError err + Right style -> return style{ styleAbbreviations = mbAbbrevs } + + +-- Retrieve citeproc lang based on metadata. +getLang :: PandocMonad m => Meta -> m (Maybe Lang) +getLang meta = maybe (return Nothing) bcp47LangToIETF + ((lookupMeta "lang" meta <|> lookupMeta "locale" meta) >>= + metaValueToText) + +-- | Get references defined inline in the metadata and via an external +-- bibliography. Only references that are actually cited in the document +-- (either with a genuine citation or with `nocite`) are returned. +-- URL variables are converted to links. +getReferences :: PandocMonad m + => Maybe Locale -> Pandoc -> m [Reference Inlines] +getReferences mblocale (Pandoc meta bs) = do + locale <- case mblocale of + Just l -> return l + Nothing -> do + mblang <- getLang meta + case mblang of + Just lang -> return $ either mempty id $ getLocale lang + Nothing -> return mempty + let getCiteId (Cite cs _) = Set.fromList $ map B.citationId cs getCiteId _ = mempty let metanocites = lookupMeta "nocite" meta - let meta' = deleteMeta "nocite" meta let nocites = maybe mempty (query getCiteId) metanocites let citeIds = query getCiteId (Pandoc meta bs) let idpred = if "*" `Set.member` nocites then const True else (`Set.member` citeIds) - refs <- map (linkifyVariables . legacyDateRanges) <$> - case lookupMeta "references" meta of - Just (MetaList rs) -> return $ mapMaybe metaValueToReference rs - _ -> - case lookupMeta "bibliography" meta of - Just (MetaList xs) -> - mconcat <$> - mapM (getRefsFromBib locale idpred) - (mapMaybe metaValueToText xs) - Just x -> - case metaValueToText x of - Just fp -> getRefsFromBib locale idpred fp - Nothing -> return [] - Nothing -> return [] - let otherIdsMap = foldr (\ref m -> - case T.words . extractText <$> - M.lookup "other-ids" - (referenceVariables ref) of - Nothing -> m - Just ids -> foldr - (\id' -> - M.insert id' (referenceId ref)) m ids) - M.empty refs - -- TODO: issue warning if no refs defined - let citations = getCitations locale otherIdsMap $ Pandoc meta' bs - let linkCites = maybe False truish $ lookupMeta "link-citations" meta - let opts = defaultCiteprocOptions{ linkCitations = linkCites } - let result = Citeproc.citeproc opts style (localeLanguage locale) - refs citations - mapM_ (report . CiteprocWarning) (resultWarnings result) - let sopts = styleOptions style - let classes = "references" : -- TODO remove this or keep for compatibility? - "csl-bib-body" : - ["hanging-indent" | styleHangingIndent sopts] - let refkvs = (case styleEntrySpacing sopts of - Just es | es > 0 -> (("entry-spacing",T.pack $ show es):) - _ -> id) . 
- (case styleLineSpacing sopts of - Just ls | ls > 1 -> (("line-spacing",T.pack $ show ls):) - _ -> id) $ [] - let bibs = mconcat $ map (\(ident, out) -> - B.divWith ("ref-" <> ident,["csl-entry"],[]) . B.para . - walk (convertQuotes locale) . insertSpace $ out) - (resultBibliography result) - let moveNotes = maybe True truish $ - lookupMeta "notes-after-punctuation" meta - let cits = map (walk fixLinks . walk (convertQuotes locale)) $ - resultCitations result + let inlineRefs = case lookupMeta "references" meta of + Just (MetaList rs) -> + filter (idpred . unItemId . referenceId) + $ mapMaybe metaValueToReference rs + _ -> [] + externalRefs <- case lookupMeta "bibliography" meta of + Just (MetaList xs) -> + mconcat <$> + mapM (getRefsFromBib locale idpred) + (mapMaybe metaValueToText xs) + Just x -> + case metaValueToText x of + Just fp -> getRefsFromBib locale idpred fp + Nothing -> return [] + Nothing -> return [] + return $ map (linkifyVariables . legacyDateRanges) + (externalRefs ++ inlineRefs) + -- note that inlineRefs can override externalRefs - let fixQuotes = case localePunctuationInQuote locale of - Just True -> - B.toList . movePunctuationInsideQuotes . B.fromList - _ -> id - let Pandoc meta'' bs' = - maybe id (setMeta "nocite") metanocites . - walk (map capitalizeNoteCitation . - fixQuotes . mvPunct moveNotes locale) . - walk deNote . - evalState (walkM insertResolvedCitations $ Pandoc meta' bs) - $ cits - return $ Pandoc meta'' - $ insertRefs refkvs classes meta'' - (walk fixLinks $ B.toList bibs) bs' -- If we have a span.csl-left-margin followed by span.csl-right-inline, -- we insert a space. This ensures that they will be separated by a space, @@ -187,24 +230,23 @@ insertSpace ils = getRefsFromBib :: PandocMonad m => Locale -> (Text -> Bool) -> Text -> m [Reference Inlines] -getRefsFromBib locale idpred t = do - let fp = T.unpack t - raw <- readFileStrict fp - case formatFromExtension fp of +getRefsFromBib locale idpred fp = do + (raw, _) <- fetchItem fp + case formatFromExtension (T.unpack fp) of Just f -> getRefs locale f idpred (Just fp) raw Nothing -> throwError $ PandocAppError $ - "Could not determine bibliography format for " <> t + "Could not determine bibliography format for " <> fp getRefs :: PandocMonad m => Locale -> BibFormat -> (Text -> Bool) - -> Maybe FilePath + -> Maybe Text -> ByteString -> m [Reference Inlines] getRefs locale format idpred mbfp raw = do let err' = throwError . - PandocBibliographyError (maybe mempty T.pack mbfp) + PandocBibliographyError (fromMaybe mempty mbfp) case format of Format_bibtex -> either (err' . tshow) return . 
@@ -219,7 +261,7 @@ getRefs locale format idpred mbfp raw = do Format_yaml -> do rs <- yamlToRefs idpred def{ readerExtensions = pandocExtensions } - mbfp + (T.unpack <$> mbfp) (L.fromStrict raw) return $ mapMaybe metaValueToReference rs @@ -248,7 +290,7 @@ insertResolvedCitations (Cite cs ils) = do [] -> return (Cite cs ils) (x:xs) -> do put xs - return $ Cite cs (B.toList x) + return $ Cite cs (walk fixLinks $ B.toList x) insertResolvedCitations x = return x getCitations :: Locale @@ -330,7 +372,6 @@ formatFromExtension fp = case dropWhile (== '.') $ takeExtension fp of isNote :: Inline -> Bool -isNote (Note _) = True isNote (Cite _ [Note _]) = True -- the following allows citation styles that are "in-text" but use superscript -- references to be treated as if they are "notes" for the purposes of moving @@ -343,6 +384,12 @@ isSpacy Space = True isSpacy SoftBreak = True isSpacy _ = False +movePunctInsideQuotes :: Locale -> [Inline] -> [Inline] +movePunctInsideQuotes locale + | localePunctuationInQuote locale == Just True + = B.toList . movePunctuationInsideQuotes . B.fromList + | otherwise + = id mvPunct :: Bool -> Locale -> [Inline] -> [Inline] mvPunct moveNotes locale (x : xs) @@ -355,7 +402,8 @@ mvPunct moveNotes locale (q : s : x : ys) in if moveNotes then if T.null spunct then q : x : mvPunct moveNotes locale ys - else q : Str spunct : x : mvPunct moveNotes locale + else movePunctInsideQuotes locale + [q , Str spunct , x] ++ mvPunct moveNotes locale (B.toList (dropTextWhile isPunctuation (B.fromList ys))) else q : x : mvPunct moveNotes locale ys @@ -367,9 +415,10 @@ mvPunct moveNotes locale (Cite cs ils : ys) , moveNotes = let s = stringify ys spunct = T.takeWhile isPunctuation s - in Cite cs (init ils - ++ [Str spunct | not (endWithPunct False (init ils))] - ++ [last ils]) : + in Cite cs (movePunctInsideQuotes locale $ + init ils + ++ [Str spunct | not (endWithPunct False (init ils))] + ++ [last ils]) : mvPunct moveNotes locale (B.toList (dropTextWhile isPunctuation (B.fromList ys))) mvPunct moveNotes locale (s : x : ys) | isSpacy s, isNote x = @@ -385,7 +434,7 @@ mvPunct _ _ [] = [] -- move https://doi.org etc. 
prefix inside link text (#6723): fixLinks :: [Inline] -> [Inline] fixLinks (Str t : Link attr [Str u1] (u2,tit) : xs) - | t <> u1 == u2 + | u2 == t <> u1 = Link attr [Str (t <> u1)] (u2,tit) : fixLinks xs fixLinks (x:xs) = x : fixLinks xs fixLinks [] = [] @@ -455,7 +504,8 @@ insertRefs refkvs refclasses meta refs bs = put True -- refHeader isn't used if you have an explicit references div let cs' = ordNub $ cs ++ refclasses - return $ Div ("refs",cs',kvs) (xs ++ refs) + let kvs' = ordNub $ kvs ++ refkvs + return $ Div ("refs",cs',kvs') (xs ++ refs) go x = return x refTitle :: Meta -> Maybe [Inline] @@ -498,13 +548,15 @@ linkifyVariables ref = fixShortDOI x = let x' = extractText x in if "10/" `T.isPrefixOf` x' then TextVal $ T.drop 3 x' - -- see http://shortdoi.org + -- see https://shortdoi.org else TextVal x' tolink pref x = let x' = extractText x x'' = if "://" `T.isInfixOf` x' then x' else pref <> x' - in FancyVal (B.link x'' "" (B.str x')) + in if T.null x' + then x + else FancyVal (B.link x'' "" (B.str x')) extractText :: Val Inlines -> Text extractText (TextVal x) = x @@ -512,42 +564,62 @@ extractText (FancyVal x) = toText x extractText (NumVal n) = T.pack (show n) extractText _ = mempty -capitalizeNoteCitation :: Inline -> Inline -capitalizeNoteCitation (Cite cs [Note [Para ils]]) = - Cite cs - [Note [Para $ B.toList $ addTextCase Nothing CapitalizeFirst - $ B.fromList ils]] -capitalizeNoteCitation x = x - -deNote :: [Inline] -> [Inline] -deNote [] = [] -deNote (Note bs:rest) = - Note (walk go bs) : deNote rest +-- Here we take the Spans with class csl-note that are left +-- after deNote has removed nested ones, and convert them +-- into real notes. +addNote :: Inline -> Inline +addNote (Span ("",["csl-note"],[]) ils) = + Note [Para $ + B.toList . addTextCase Nothing CapitalizeFirst . B.fromList $ ils] +addNote x = x + +-- Here we handle citation notes that occur inside footnotes +-- or other citation notes, in a note style. We don't want +-- notes inside notes, so we convert these to parenthesized +-- or comma-separated citations. +deNote :: Inline -> Inline +deNote (Note bs) = + case bs of + [Para (cit@(Cite (c:_) _) : ils)] + | citationMode c /= AuthorInText -> + -- if citation is first in note, no need to parenthesize. + Note [Para (walk removeNotes $ cit : walk addParens ils)] + _ -> Note (walk removeNotes . 
walk addParens $ bs) + where - go [] = [] - go (Cite (c:cs) ils : zs) + addParens [] = [] + addParens (Cite (c:cs) ils : zs) | citationMode c == AuthorInText - = Cite (c:cs) (concatMap (noteAfterComma (needsPeriod zs)) ils) : go zs + = Cite (c:cs) (concatMap (noteAfterComma (needsPeriod zs)) ils) : + addParens zs | otherwise - = Cite (c:cs) (concatMap noteInParens ils) : go zs - go (x:xs) = x : go xs + = Cite (c:cs) (concatMap noteInParens ils) : addParens zs + addParens (x:xs) = x : addParens xs + + removeNotes (Span ("",["csl-note"],[]) ils) = Span ("",[],[]) ils + removeNotes x = x + needsPeriod [] = True needsPeriod (Str t:_) = case T.uncons t of Nothing -> False Just (c,_) -> isUpper c needsPeriod (Space:zs) = needsPeriod zs needsPeriod _ = False - noteInParens (Note bs') + + noteInParens (Span ("",["csl-note"],[]) ils) = Space : Str "(" : - removeFinalPeriod (blocksToInlines bs') ++ [Str ")"] + removeFinalPeriod ils ++ [Str ")"] noteInParens x = [x] - noteAfterComma needsPer (Note bs') + + noteAfterComma needsPer (Span ("",["csl-note"],[]) ils) + | not (null ils) = Str "," : Space : - (if needsPer - then id - else removeFinalPeriod) (blocksToInlines bs') + if needsPer + then ils + else removeFinalPeriod ils noteAfterComma _ x = [x] -deNote (x:xs) = x : deNote xs + +deNote x = x -- Note: we can't use dropTextWhileEnd indiscriminately, -- because this would remove the final period on abbreviations like Ibid. @@ -579,3 +651,11 @@ removeFinalPeriod ils = isRightQuote "\8217" = True isRightQuote "\187" = True isRightQuote _ = False + +bcp47LangToIETF :: PandocMonad m => Text -> m (Maybe Lang) +bcp47LangToIETF bcplang = + case parseLang bcplang of + Left _ -> do + report $ InvalidLang bcplang + return Nothing + Right lang -> return $ Just lang diff --git a/src/Text/Pandoc/Citeproc/BibTeX.hs b/src/Text/Pandoc/Citeproc/BibTeX.hs index 552339df0..c178de6e9 100644 --- a/src/Text/Pandoc/Citeproc/BibTeX.hs +++ b/src/Text/Pandoc/Citeproc/BibTeX.hs @@ -1,4 +1,5 @@ {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE FlexibleContexts #-} @@ -17,6 +18,7 @@ module Text.Pandoc.Citeproc.BibTeX ( Variant(..) , readBibtexString + , writeBibtexString ) where @@ -24,10 +26,11 @@ import Text.Pandoc.Definition import Text.Pandoc.Builder as B import Text.Pandoc.Readers.LaTeX (readLaTeX) import Text.Pandoc.Extensions (Extension(..), extensionsFromList) -import Text.Pandoc.Options (ReaderOptions(..)) -import Text.Pandoc.Class (runPure) +import Text.Pandoc.Options (ReaderOptions(..), WriterOptions) import Text.Pandoc.Error (PandocError) import Text.Pandoc.Shared (stringify) +import Text.Pandoc.Writers.LaTeX (writeLaTeX) +import Text.Pandoc.Class (runPure) import qualified Text.Pandoc.Walk as Walk import Citeproc.Types import Citeproc.Pandoc () @@ -46,17 +49,21 @@ import qualified Data.Sequence as Seq import Data.Char (isAlphaNum, isDigit, isLetter, isUpper, toLower, toUpper, isLower, isPunctuation) -import Data.List (foldl', intercalate) +import Data.List (foldl', intercalate, intersperse) import Safe (readMay) +import Text.Printf (printf) +import Text.DocLayout (literal, hsep, nest, hang, Doc(..), + braces, ($$), cr) data Variant = Bibtex | Biblatex deriving (Show, Eq, Ord) -- | Parse BibTeX or BibLaTeX into a list of 'Reference's. 
-readBibtexString :: Variant -- ^ bibtex or biblatex +readBibtexString :: ToSources a + => Variant -- ^ bibtex or biblatex -> Locale -- ^ Locale -> (Text -> Bool) -- ^ Filter on citation ids - -> Text -- ^ bibtex/biblatex text + -> a -- ^ bibtex/biblatex text -> Either ParseError [Reference Inlines] readBibtexString variant locale idpred contents = do case runParser (((resolveCrossRefs variant <$> bibEntries) <* eof) >>= @@ -64,17 +71,280 @@ readBibtexString variant locale idpred contents = do filter (\item -> idpred (identifier item) && entryType item /= "xdata")) (fromMaybe defaultLang $ localeLanguage locale, Map.empty) - "" contents of + (initialSourceName sources) sources of Left err -> Left err Right xs -> return xs + where + sources = toSources contents + +-- | Write BibTeX or BibLaTeX given given a 'Reference'. +writeBibtexString :: WriterOptions -- ^ options (for writing LaTex) + -> Variant -- ^ bibtex or biblatex + -> Maybe Lang -- ^ Language + -> Reference Inlines -- ^ Reference to write + -> Doc Text +writeBibtexString opts variant mblang ref = + "@" <> bibtexType <> "{" <> literal (unItemId (referenceId ref)) <> "," + $$ nest 2 (renderFields fs) + $$ "}" <> cr + + where + bibtexType = + case referenceType ref of + "article-magazine" -> "article" + "article-newspaper" -> "article" + "article-journal" -> "article" + "book" -> "book" + "pamphlet" -> "booklet" + "dataset" | variant == Biblatex -> "dataset" + "webpage" | variant == Biblatex -> "online" + "chapter" -> case getVariable "editor" of + Just _ -> "incollection" + Nothing -> "inbook" + "entry-encyclopedia" | variant == Biblatex -> "inreference" + | otherwise -> "inbook" + "paper-conference" -> "inproceedings" + "thesis" -> case getVariableAsText "genre" of + Just "mathesis" -> "mastersthesis" + _ -> "phdthesis" + "patent" | variant == Biblatex -> "patent" + "report" | variant == Biblatex -> "report" + | otherwise -> "techreport" + "speech" -> "unpublished" + "manuscript" -> "unpublished" + "graphic" | variant == Biblatex -> "artwork" + "song" | variant == Biblatex -> "music" + "legal_case" | variant == Biblatex -> "jurisdictionN" + "legislation" | variant == Biblatex -> "legislation" + "treaty" | variant == Biblatex -> "legal" + "personal_communication" | variant == Biblatex -> "letter" + "motion_picture" | variant == Biblatex -> "movie" + "review" | variant == Biblatex -> "review" + _ -> "misc" + + mbSubtype = + case referenceType ref of + "article-magazine" -> Just "magazine" + "article-newspaper" -> Just "newspaper" + _ -> Nothing + + fs = + case variant of + Biblatex -> + [ "author" + , "editor" + , "translator" + , "publisher" + , "title" + , "booktitle" + , "journal" + , "series" + , "edition" + , "volume" + , "volumes" + , "number" + , "pages" + , "date" + , "eventdate" + , "urldate" + , "address" + , "url" + , "doi" + , "isbn" + , "issn" + , "type" + , "entrysubtype" + , "note" + , "langid" + , "abstract" + , "keywords" + , "annote" + ] + Bibtex -> + [ "author" + , "editor" + , "translator" + , "publisher" + , "title" + , "booktitle" + , "journal" + , "series" + , "edition" + , "volume" + , "number" + , "pages" + , "year" + , "month" + , "address" + , "type" + , "note" + , "annote" + ] + + valToInlines (TextVal t) = B.text t + valToInlines (FancyVal ils) = ils + valToInlines (NumVal n) = B.text (T.pack $ show n) + valToInlines (NamesVal names) = + mconcat $ intersperse (B.space <> B.text "and" <> B.space) + $ map renderName names + valToInlines (DateVal date) = B.text $ + case dateLiteral date of + Just t -> t + 
Nothing -> T.intercalate "/" (map renderDatePart (dateParts date)) <> + (if dateCirca date then "~" else mempty) + + renderDatePart (DateParts xs) = T.intercalate "-" $ + map (T.pack . printf "%02d") xs + + renderName name = + case nameLiteral name of + Just t -> B.text t + Nothing -> spacedMaybes + [ nameNonDroppingParticle name + , nameFamily name + , if nameCommaSuffix name + then (", " <>) <$> nameSuffix name + else nameSuffix name ] + <> + spacedMaybes + [ (", " <>) <$> nameGiven name, + nameDroppingParticle name ] + + mblang' = case getVariableAsText "language" of + Just l -> either (const Nothing) Just $ parseLang l + Nothing -> mblang + + titlecase = case mblang' of + Just lang | langLanguage lang == "en" + -> titlecase' + Nothing -> titlecase' + _ -> + case variant of + Bibtex -> B.spanWith nullAttr + -- BibTex lacks a language field, so we wrap non-English + -- titles in {} to protect case. + Biblatex -> id + + titlecase' = addTextCase mblang' TitleCase . + (\ils -> B.fromList + (case B.toList ils of + Str t : xs -> Str t : Walk.walk spanAroundCapitalizedWords xs + xs -> Walk.walk spanAroundCapitalizedWords xs)) + + -- protect capitalized words when we titlecase + spanAroundCapitalizedWords (Str t) + | not (T.all (\c -> isLower c || not (isLetter c)) t) = + Span ("",["nocase"],[]) [Str t] + spanAroundCapitalizedWords x = x + + spacedMaybes = mconcat . intersperse B.space . mapMaybe (fmap B.text) + + toLaTeX x = + case runPure (writeLaTeX opts $ doc (B.plain x)) of + Left _ -> Nothing + Right t -> Just $ hsep . map literal $ T.words t + + renderField :: Text -> Maybe (Doc Text) + renderField name = + (((literal name) <>) . hang 2 " = " . braces) + <$> getContentsFor name + + getVariable v = lookupVariable (toVariable v) ref + + getVariableAsText v = (stringify . valToInlines) <$> getVariable v + + getYear val = + case val of + DateVal date -> + case dateLiteral date of + Just t -> toLaTeX (B.text t) + Nothing -> + case dateParts date of + [DateParts (y1:_), DateParts (y2:_)] -> + Just $ literal (T.pack (printf "%04d" y1) <> "--" <> + T.pack (printf "%04d" y2)) + [DateParts (y1:_)] -> + Just $ literal (T.pack (printf "%04d" y1)) + _ -> Nothing + _ -> Nothing + + toMonth 1 = "jan" + toMonth 2 = "feb" + toMonth 3 = "mar" + toMonth 4 = "apr" + toMonth 5 = "may" + toMonth 6 = "jun" + toMonth 7 = "jul" + toMonth 8 = "aug" + toMonth 9 = "sep" + toMonth 10 = "oct" + toMonth 11 = "nov" + toMonth 12 = "dec" + toMonth x = T.pack $ show x + + getMonth val = + case val of + DateVal date -> + case dateParts date of + [DateParts (_:m1:_), DateParts (_:m2:_)] -> + Just $ literal (toMonth m1 <> "--" <> toMonth m2) + [DateParts (_:m1:_)] -> Just $ literal (toMonth m1) + _ -> Nothing + _ -> Nothing + + getContentsFor :: Text -> Maybe (Doc Text) + getContentsFor "type" = + getVariableAsText "genre" >>= + \case + "mathesis" -> Just "mastersthesis" + "phdthesis" -> Just "phdthesis" + _ -> Nothing + getContentsFor "entrysubtype" = literal <$> mbSubtype + getContentsFor "journal" + | bibtexType `elem` ["article", "periodical", "suppperiodical", "review"] + = getVariable "container-title" >>= toLaTeX . valToInlines + | otherwise = Nothing + getContentsFor "booktitle" + | bibtexType `elem` + ["inbook","incollection","inproceedings","inreference","bookinbook"] + = (getVariable "volume-title" <|> getVariable "container-title") + >>= toLaTeX . valToInlines + | otherwise = Nothing + getContentsFor "series" = getVariable "collection-title" + >>= toLaTeX . 
valToInlines + getContentsFor "address" = getVariable "publisher-place" + >>= toLaTeX . valToInlines + getContentsFor "date" = getVariable "issued" >>= toLaTeX . valToInlines + getContentsFor "eventdate" = getVariable "event-date" >>= toLaTeX . valToInlines + getContentsFor "urldate" = getVariable "accessed" >>= toLaTeX . valToInlines + getContentsFor "year" = getVariable "issued" >>= getYear + getContentsFor "month" = getVariable "issued" >>= getMonth + getContentsFor "pages" = getVariable "page" >>= toLaTeX . valToInlines + getContentsFor "langid" = getVariable "language" >>= toLaTeX . valToInlines + getContentsFor "number" = (getVariable "number" + <|> getVariable "collection-number" + <|> getVariable "issue") >>= toLaTeX . valToInlines + + getContentsFor x = getVariable x >>= + if isURL x + then Just . literal . stringify . valToInlines + else toLaTeX . + (if x == "title" + then titlecase + else id) . + valToInlines + + isURL x = x `elem` ["url","doi","issn","isbn"] + + renderFields = mconcat . intersperse ("," <> cr) . mapMaybe renderField defaultLang :: Lang -defaultLang = Lang "en" (Just "US") +defaultLang = Lang "en" Nothing (Just "US") [] [] [] -- a map of bibtex "string" macros type StringMap = Map.Map Text Text -type BibParser = Parser Text (Lang, StringMap) +type BibParser = Parser Sources (Lang, StringMap) data Item = Item{ identifier :: Text , sourcePos :: SourcePos @@ -89,9 +359,7 @@ itemToReference locale variant item = do bib item $ do let lang = fromMaybe defaultLang $ localeLanguage locale modify $ \st -> st{ localeLang = lang, - untitlecase = case lang of - (Lang "en" _) -> True - _ -> False } + untitlecase = langLanguage lang == "en" } id' <- asks identifier otherIds <- (Just <$> getRawField "ids") @@ -315,10 +583,10 @@ itemToReference locale variant item = do eprint <- getRawField "eprint" let baseUrl = case T.toLower etype of - "arxiv" -> "http://arxiv.org/abs/" - "jstor" -> "http://www.jstor.org/stable/" - "pubmed" -> "http://www.ncbi.nlm.nih.gov/pubmed/" - "googlebooks" -> "http://books.google.com?id=" + "arxiv" -> "https://arxiv.org/abs/" + "jstor" -> "https://www.jstor.org/stable/" + "pubmed" -> "https://www.ncbi.nlm.nih.gov/pubmed/" + "googlebooks" -> "https://books.google.com?id=" _ -> "" if T.null baseUrl then mzero @@ -449,7 +717,7 @@ itemToReference locale variant item = do bib :: Item -> Bib a -> BibParser a -bib entry m = fst <$> evalRWST m entry (BibState True (Lang "en" (Just "US"))) +bib entry m = fst <$> evalRWST m entry (BibState True defaultLang) resolveCrossRefs :: Variant -> [Item] -> [Item] resolveCrossRefs variant entries = @@ -502,41 +770,10 @@ blocksToInlines bs = _ -> B.fromList $ Walk.query (:[]) bs adjustSpans :: Lang -> Inline -> Inline -adjustSpans lang (RawInline (Format "latex") s) - | s == "\\hyphen" || s == "\\hyphen " = Str "-" - | otherwise = parseRawLaTeX lang s +adjustSpans lang (Span ("",[],[("bibstring",s)]) _) = Str $ resolveKey' lang s adjustSpans _ SoftBreak = Space adjustSpans _ x = x -parseRawLaTeX :: Lang -> Text -> Inline -parseRawLaTeX lang t@(T.stripPrefix "\\" -> Just xs) = - case parseLaTeX lang contents of - Right [Para ys] -> f command ys - Right [Plain ys] -> f command ys - Right [] -> f command [] - _ -> RawInline (Format "latex") t - where (command', contents') = T.break (\c -> c =='{' || c =='\\') xs - command = T.strip command' - contents = T.drop 1 $ T.dropEnd 1 contents' - f "mkbibquote" ils = Span nullAttr [Quoted DoubleQuote ils] - f "mkbibemph" ils = Span nullAttr [Emph ils] - f "mkbibitalic" ils = Span 
nullAttr [Emph ils] - f "mkbibbold" ils = Span nullAttr [Strong ils] - f "mkbibparens" ils = Span nullAttr $ - [Str "("] ++ ils ++ [Str ")"] - f "mkbibbrackets" ils = Span nullAttr $ - [Str "["] ++ ils ++ [Str "]"] - -- ... both should be nestable & should work in year fields - f "autocap" ils = Span nullAttr ils - -- TODO: should work in year fields - f "textnormal" ils = Span ("",["nodecor"],[]) ils - f "bibstring" [Str s] = Str $ resolveKey' lang s - f "adddot" [] = Str "." - f "adddotspace" [] = Span nullAttr [Str ".", Space] - f "addabbrvspace" [] = Space - f _ ils = Span nullAttr ils -parseRawLaTeX _ t = RawInline (Format "latex") t - latex' :: Text -> Bib [Block] latex' t = do lang <- gets localeLang @@ -572,7 +809,7 @@ bibEntries = do (bibComment <|> bibPreamble <|> bibString)) bibSkip :: BibParser () -bibSkip = () <$ take1WhileP (/='@') +bibSkip = skipMany1 (satisfy (/='@')) bibComment :: BibParser () bibComment = do @@ -597,11 +834,14 @@ bibString = do updateState (\(l,m) -> (l, Map.insert k v m)) return () +take1WhileP :: Monad m => (Char -> Bool) -> ParserT Sources u m Text +take1WhileP f = T.pack <$> many1 (satisfy f) + inBraces :: BibParser Text inBraces = do char '{' res <- manyTill - ( (T.pack <$> many1 (noneOf "{}\\")) + ( take1WhileP (\c -> c /= '{' && c /= '}' && c /= '\\') <|> (char '\\' >> ( (char '{' >> return "\\{") <|> (char '}' >> return "\\}") <|> return "\\")) @@ -616,7 +856,7 @@ inQuotes :: BibParser Text inQuotes = do char '"' T.concat <$> manyTill - ( (T.pack <$> many1 (noneOf "\"\\{")) + ( take1WhileP (\c -> c /= '{' && c /= '"' && c /= '\\') <|> (char '\\' >> T.cons '\\' . T.singleton <$> anyChar) <|> braced <$> inBraces ) (char '"') @@ -628,7 +868,7 @@ fieldName = resolveAlias . T.toLower isBibtexKeyChar :: Char -> Bool isBibtexKeyChar c = - isAlphaNum c || c `elem` (".:;?!`'()/*@_+=-[]*&" :: [Char]) + isAlphaNum c || c `elem` (".:;?!`'()$/*@_+=-[]*&" :: [Char]) bibItem :: BibParser Item bibItem = do @@ -812,14 +1052,14 @@ getOldDate prefix = do let dateparts = filter (\x -> x /= DateParts []) $ map toDateParts [(year',month',day'), (endyear',endmonth',endday')] - literal <- if null dateparts - then Just <$> getRawField (prefix <> "year") - else return Nothing + literal' <- if null dateparts + then Just <$> getRawField (prefix <> "year") + else return Nothing return $ Date { dateParts = dateparts , dateCirca = False , dateSeason = Nothing - , dateLiteral = literal } + , dateLiteral = literal' } getRawField :: Text -> Bib Text getRawField f = do @@ -1225,8 +1465,9 @@ resolveKey lang ils = Walk.walk go ils go x = x resolveKey' :: Lang -> Text -> Text -resolveKey' lang@(Lang l _) k = - case Map.lookup l biblatexStringMap >>= Map.lookup (T.toLower k) of +resolveKey' lang k = + case Map.lookup (langLanguage lang) biblatexStringMap >>= + Map.lookup (T.toLower k) of Nothing -> k Just (x, _) -> either (const k) stringify $ parseLaTeX lang x diff --git a/src/Text/Pandoc/Citeproc/Data.hs b/src/Text/Pandoc/Citeproc/Data.hs index dfdaf2598..848a83a1e 100644 --- a/src/Text/Pandoc/Citeproc/Data.hs +++ b/src/Text/Pandoc/Citeproc/Data.hs @@ -10,7 +10,7 @@ import qualified Data.Text.Encoding as TE import qualified Data.Text as T import Data.Text (Text) import Text.Pandoc.Citeproc.Util (toIETF) -import Citeproc (Lang(..), parseLang) +import Text.Collate.Lang (Lang(..), parseLang) biblatexLocalizations :: [(FilePath, ByteString)] biblatexLocalizations = $(embedDir "citeproc/biblatex-localization") @@ -21,11 +21,12 @@ biblatexStringMap :: M.Map Text (M.Map Text (Text, Text)) 
biblatexStringMap = foldr go mempty biblatexLocalizations where go (fp, bs) = - let Lang lang _ = parseLang (toIETF $ T.takeWhile (/= '.') $ T.pack fp) - ls = T.lines $ TE.decodeUtf8 bs - in if length ls > 4 - then M.insert lang (toStringMap $ map (T.splitOn "|") ls) - else id + let ls = T.lines $ TE.decodeUtf8 bs + in case parseLang (toIETF $ T.takeWhile (/= '.') $ T.pack fp) of + Right lang | length ls > 4 + -> M.insert (langLanguage lang) + (toStringMap $ map (T.splitOn "|") ls) + _ -> id toStringMap = foldr go' mempty go' [term, x, y] = M.insert term (x, y) go' _ = id diff --git a/src/Text/Pandoc/Citeproc/Locator.hs b/src/Text/Pandoc/Citeproc/Locator.hs index dba762c02..f8931d7b5 100644 --- a/src/Text/Pandoc/Citeproc/Locator.hs +++ b/src/Text/Pandoc/Citeproc/Locator.hs @@ -7,6 +7,7 @@ where import Citeproc.Types import Data.Text (Text) import qualified Data.Text as T +import Data.List (foldl') import Text.Parsec import Text.Pandoc.Definition import Text.Pandoc.Parsing (romanNumeral) @@ -19,7 +20,7 @@ parseLocator :: Locale -> [Inline] -> (Maybe (Text, Text), [Inline]) parseLocator locale inp = case parse (pLocatorWords (toLocatorMap locale)) "suffix" $ splitInp inp of Right r -> r - Left _ -> (Nothing, inp) + Left _ -> (Nothing, maybeAddComma inp) splitInp :: [Inline] -> [Inline] splitInp = splitStrWhen (\c -> isSpace c || (isPunctuation c && c /= ':')) @@ -41,9 +42,17 @@ pLocatorWords locMap = do -- i.e. the first one will be " 9" return $ if T.null la && T.null lo - then (Nothing, s) + then (Nothing, maybeAddComma s) else (Just (la, T.strip lo), s) +maybeAddComma :: [Inline] -> [Inline] +maybeAddComma [] = [] +maybeAddComma ils@(Space : _) = ils +maybeAddComma ils@(Str t : _) + | Just (c, _) <- T.uncons t + , isPunctuation c = ils +maybeAddComma ils = Str "," : Space : ils + pLocatorDelimited :: LocatorMap -> LocatorParser (Text, Text) pLocatorDelimited locMap = try $ do _ <- pMatchChar "{" (== '{') @@ -96,7 +105,7 @@ pLocatorLabel' locMap lim = go "" t <- anyToken ts <- manyTill anyToken (try $ lookAhead lim) let s = acc <> stringify (t:ts) - case M.lookup (T.strip s) locMap of + case M.lookup (T.toCaseFold $ T.strip s) locMap of -- try to find a longer one, or return this one Just l -> go s <|> return (l, False) Nothing -> go s @@ -139,7 +148,7 @@ pBalancedBraces braces p = try $ do where except = notFollowedBy pBraces >> p -- outer and inner - surround = foldl (\a (open, close) -> sur open close except <|> a) + surround = foldl' (\a (open, close) -> sur open close except <|> a) except braces @@ -180,6 +189,7 @@ pPageUnit = roman <|> plainUnit plainUnit = do ts <- many1 (notFollowedBy pSpace >> notFollowedBy pLocatorPunct >> + notFollowedBy pMath >> anyToken) let s = stringify ts -- otherwise look for actual digits or -s @@ -210,6 +220,12 @@ pMatchChar msg f = satisfyTok f' <?> msg pSpace :: LocatorParser Inline pSpace = satisfyTok (\t -> isSpacey t || t == Str "\160") <?> "space" +pMath :: LocatorParser Inline +pMath = satisfyTok isMath + where + isMath (Math{}) = True + isMath _ = False + satisfyTok :: (Inline -> Bool) -> LocatorParser Inline satisfyTok f = tokenPrim show (\sp _ _ -> sp) (\tok -> if f tok then Just tok diff --git a/src/Text/Pandoc/Citeproc/MetaValue.hs b/src/Text/Pandoc/Citeproc/MetaValue.hs index f5a49f49e..b43ca7314 100644 --- a/src/Text/Pandoc/Citeproc/MetaValue.hs +++ b/src/Text/Pandoc/Citeproc/MetaValue.hs @@ -135,12 +135,13 @@ metaValueToVal k v MetaMap _ -> TextVal mempty metaValueToDate :: MetaValue -> Date -metaValueToDate (MetaMap m) = - Date 
+metaValueToDate (MetaMap m) = fromMaybe + (Date { dateParts = dateparts , dateCirca = circa , dateSeason = season - , dateLiteral = literal } + , dateLiteral = literal }) + rawdate where dateparts = case M.lookup "date-parts" m of Just (MetaList xs) -> @@ -152,6 +153,7 @@ metaValueToDate (MetaMap m) = M.lookup "circa" m >>= metaValueToBool season = M.lookup "season" m >>= metaValueToInt literal = M.lookup "literal" m >>= metaValueToText + rawdate = M.lookup "raw" m >>= metaValueToText >>= rawDateEDTF metaValueToDate (MetaList xs) = Date{ dateParts = mapMaybe metaValueToDateParts xs , dateCirca = False diff --git a/src/Text/Pandoc/Class/CommonState.hs b/src/Text/Pandoc/Class/CommonState.hs index 7e1735c2b..796a4afd5 100644 --- a/src/Text/Pandoc/Class/CommonState.hs +++ b/src/Text/Pandoc/Class/CommonState.hs @@ -19,7 +19,7 @@ where import Data.Default (Default (def)) import Data.Text (Text) -import Text.Pandoc.BCP47 (Lang) +import Text.Collate.Lang (Lang) import Text.Pandoc.MediaBag (MediaBag) import Text.Pandoc.Logging (LogMessage, Verbosity (WARNING)) import Text.Pandoc.Translations (Translations) diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs index eecda5711..f4cfc8682 100644 --- a/src/Text/Pandoc/Class/IO.hs +++ b/src/Text/Pandoc/Class/IO.hs @@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image)) import Text.Pandoc.Error (PandocError (..)) import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage) import Text.Pandoc.MIME (MimeType) -import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory) +import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaItems) import Text.Pandoc.Walk (walk) import qualified Control.Exception as E import qualified Data.ByteString as B @@ -183,7 +183,7 @@ getModificationTime = liftIOError System.Directory.getModificationTime logOutput :: (PandocMonad m, MonadIO m) => LogMessage -> m () logOutput msg = liftIO $ do UTF8.hPutStr stderr $ - "[" ++ show (messageVerbosity msg) ++ "] " + "[" <> T.pack (show (messageVerbosity msg)) <> "] " alertIndent $ T.lines $ showLogMessage msg -- | Prints the list of lines to @stderr@, indenting every but the first @@ -191,42 +191,41 @@ logOutput msg = liftIO $ do alertIndent :: [Text] -> IO () alertIndent [] = return () alertIndent (l:ls) = do - UTF8.hPutStrLn stderr $ unpack l + UTF8.hPutStrLn stderr l mapM_ go ls where go l' = do UTF8.hPutStr stderr " " - UTF8.hPutStrLn stderr $ unpack l' + UTF8.hPutStrLn stderr l' -- | Extract media from the mediabag into a directory. extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc extractMedia dir d = do media <- getMediaBag - case [fp | (fp, _, _) <- mediaDirectory media] of - [] -> return d - fps -> do - mapM_ (writeMedia dir media) fps - return $ walk (adjustImagePath dir fps) d + let items = mediaItems media + if null items + then return d + else do + mapM_ (writeMedia dir) items + return $ walk (adjustImagePath dir media) d -- | Write the contents of a media bag to a path. 
writeMedia :: (PandocMonad m, MonadIO m) - => FilePath -> MediaBag -> FilePath + => FilePath + -> (FilePath, MimeType, BL.ByteString) -> m () -writeMedia dir mediabag subpath = do - -- we join and split to convert a/b/c to a\b\c on Windows; - -- in zip containers all paths use / - let fullpath = dir </> unEscapeString (normalise subpath) - let mbcontents = lookupMedia subpath mediabag - case mbcontents of - Nothing -> throwError $ PandocResourceNotFound $ pack subpath - Just (_, bs) -> do - report $ Extracting $ pack fullpath - liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath) - logIOError $ BL.writeFile fullpath bs +writeMedia dir (fp, _mt, bs) = do + -- we normalize to get proper path separators for the platform + let fullpath = normalise $ dir </> fp + liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath) + logIOError $ BL.writeFile fullpath bs -- | If the given Inline element is an image with a @src@ path equal to -- one in the list of @paths@, then prepends @dir@ to the image source; -- returns the element unchanged otherwise. -adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline -adjustImagePath dir paths (Image attr lab (src, tit)) - | unpack src `elem` paths - = Image attr lab (pack (normalise $ dir </> unpack src), tit) +adjustImagePath :: FilePath -> MediaBag -> Inline -> Inline +adjustImagePath dir mediabag (Image attr lab (src, tit)) = + case lookupMedia (T.unpack src) mediabag of + Nothing -> Image attr lab (src, tit) + Just item -> + let fullpath = dir <> "/" <> mediaPath item + in Image attr lab (T.pack fullpath, tit) adjustImagePath _ _ x = x diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs index 374da161b..439aec071 100644 --- a/src/Text/Pandoc/Class/PandocMonad.hs +++ b/src/Text/Pandoc/Class/PandocMonad.hs @@ -1,4 +1,5 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE TupleSections #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} @@ -37,7 +38,6 @@ module Text.Pandoc.Class.PandocMonad , setUserDataDir , getUserDataDir , fetchItem - , fetchMediaResource , getInputFiles , setInputFiles , getOutputFile @@ -51,30 +51,31 @@ module Text.Pandoc.Class.PandocMonad , setTranslations , translateTerm , makeCanonical + , getTimestamp ) where import Codec.Archive.Zip import Control.Monad.Except (MonadError (catchError, throwError), MonadTrans, lift, when) -import Data.Digest.Pure.SHA (sha1, showDigest) -import Data.Maybe (fromMaybe) +import Data.List (foldl') import Data.Time (UTCTime) -import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds) +import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds, + posixSecondsToUTCTime) import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime) import Network.URI ( escapeURIString, nonStrictRelativeTo, unEscapeString, parseURIReference, isAllowedInURI, parseURI, URI(..) 
) -import System.FilePath ((</>), (<.>), takeExtension, dropExtension, - isRelative, splitDirectories) +import System.FilePath ((</>), takeExtension, dropExtension, + isRelative, splitDirectories, makeRelative) import System.Random (StdGen) -import Text.Pandoc.BCP47 (Lang(..), parseBCP47, renderLang) +import Text.Collate.Lang (Lang(..), parseLang, renderLang) import Text.Pandoc.Class.CommonState (CommonState (..)) import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Logging -import Text.Pandoc.MIME (MimeType, getMimeType, extensionFromMimeType) -import Text.Pandoc.MediaBag (MediaBag, lookupMedia) -import Text.Pandoc.Shared (uriPathToPath) +import Text.Pandoc.MIME (MimeType, getMimeType) +import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..)) +import Text.Pandoc.Shared (uriPathToPath, safeRead) import Text.Pandoc.Translations (Term(..), Translations, lookupTerm, readTranslations) import Text.Pandoc.Walk (walkM) @@ -175,6 +176,21 @@ report msg = do when (level <= verbosity) $ logOutput msg modifyCommonState $ \st -> st{ stLog = msg : stLog st } +-- | Get the time from the @SOURCE_DATE_EPOCH@ +-- environment variable. The variable should contain a +-- unix time stamp, the number of seconds since midnight Jan 01 +-- 1970 UTC. If the variable is not set or cannot be +-- parsed as a unix time stamp, the current time is returned. +-- This function is designed to make possible reproducible +-- builds in formats that include a creation timestamp. +getTimestamp :: PandocMonad m => m UTCTime +getTimestamp = do + mbSourceDateEpoch <- lookupEnv "SOURCE_DATE_EPOCH" + case mbSourceDateEpoch >>= safeRead of + Just (epoch :: Integer) -> + return $ posixSecondsToUTCTime $ fromIntegral epoch + Nothing -> getCurrentTime + -- | Determine whether tracing is enabled. This affects -- the behavior of 'trace'. If tracing is not enabled, -- 'trace' does nothing. @@ -267,7 +283,7 @@ readFileFromDirs (d:ds) f = catchError toLang :: PandocMonad m => Maybe T.Text -> m (Maybe Lang) toLang Nothing = return Nothing toLang (Just s) = - case parseBCP47 s of + case parseLang s of Left _ -> do report $ InvalidLang s return Nothing @@ -358,7 +374,8 @@ fetchItem :: PandocMonad m fetchItem s = do mediabag <- getMediaBag case lookupMedia (T.unpack s) mediabag of - Just (mime, bs) -> return (BL.toStrict bs, Just mime) + Just item -> return (BL.toStrict (mediaContents item), + Just (mediaMimeType item)) Nothing -> downloadOrRead s -- | Returns the content and, if available, the MIME type of a resource. @@ -393,9 +410,10 @@ downloadOrRead s = do _ -> readLocalFile fp -- get from local file system where readLocalFile f = do resourcePath <- getResourcePath - cont <- if isRelative f - then withPaths resourcePath readFileStrict f - else readFileStrict f + (fp', cont) <- if isRelative f + then withPaths resourcePath readFileStrict f + else (f,) <$> readFileStrict f + report $ LoadedResource f (makeRelative "." fp') return (cont, mime) httpcolon = URI{ uriScheme = "http:", uriAuthority = Nothing, @@ -595,7 +613,7 @@ checkExistence fn = do -- | Canonicalizes a file path by removing redundant @.@ and @..@. makeCanonical :: FilePath -> FilePath makeCanonical = Posix.joinPath . transformPathParts . splitDirectories - where transformPathParts = reverse . foldl go [] + where transformPathParts = reverse . foldl' go [] go as "." = as go (_:as) ".." = as go as x = x : as @@ -605,25 +623,13 @@ makeCanonical = Posix.joinPath . transformPathParts . splitDirectories -- that filepath. 
Returns the result of the first successful execution -- of the action, or throws a @PandocResourceNotFound@ exception if the -- action errors for all filepaths. -withPaths :: PandocMonad m => [FilePath] -> (FilePath -> m a) -> FilePath -> m a +withPaths :: PandocMonad m + => [FilePath] -> (FilePath -> m a) -> FilePath -> m (FilePath, a) withPaths [] _ fp = throwError $ PandocResourceNotFound $ T.pack fp withPaths (p:ps) action fp = - catchError (action (p </> fp)) + catchError ((p </> fp,) <$> action (p </> fp)) (\_ -> withPaths ps action fp) --- | Fetch local or remote resource (like an image) and provide data suitable --- for adding it to the MediaBag. -fetchMediaResource :: PandocMonad m - => T.Text -> m (FilePath, Maybe MimeType, BL.ByteString) -fetchMediaResource src = do - (bs, mt) <- downloadOrRead src - let ext = fromMaybe (T.pack $ takeExtension $ T.unpack src) - (mt >>= extensionFromMimeType) - let bs' = BL.fromChunks [bs] - let basename = showDigest $ sha1 bs' - let fname = basename <.> T.unpack ext - return (fname, mt, bs') - -- | Traverse tree, filling media bag for any images that -- aren't already in the media bag. fillMediaBag :: PandocMonad m => Pandoc -> m Pandoc @@ -631,12 +637,13 @@ fillMediaBag d = walkM handleImage d where handleImage :: PandocMonad m => Inline -> m Inline handleImage (Image attr lab (src, tit)) = catchError (do mediabag <- getMediaBag - case lookupMedia (T.unpack src) mediabag of - Just (_, _) -> return $ Image attr lab (src, tit) + let fp = T.unpack src + case lookupMedia fp mediabag of + Just _ -> return () Nothing -> do - (fname, mt, bs) <- fetchMediaResource src - insertMedia fname mt bs - return $ Image attr lab (T.pack fname, tit)) + (bs, mt) <- fetchItem src + insertMedia fp mt (BL.fromStrict bs) + return $ Image attr lab (src, tit)) (\e -> case e of PandocResourceNotFound _ -> do diff --git a/src/Text/Pandoc/Data.hs b/src/Text/Pandoc/Data.hs index 38682b9f9..55ed3f5bf 100644 --- a/src/Text/Pandoc/Data.hs +++ b/src/Text/Pandoc/Data.hs @@ -1,7 +1,7 @@ {-# LANGUAGE TemplateHaskell #-} {- | Module : Text.Pandoc.Data -Copyright : Copyright (C) 2013-2020 John MacFarlane +Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs index 2c311bb49..9dee8356b 100644 --- a/src/Text/Pandoc/Error.hs +++ b/src/Text/Pandoc/Error.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Error - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -16,38 +16,41 @@ encountered during parsing. 
-} module Text.Pandoc.Error ( PandocError(..), + renderError, handleError) where -import Control.Exception (Exception) +import Control.Exception (Exception, displayException) import Data.Typeable (Typeable) import Data.Word (Word8) import Data.Text (Text) +import Data.List (sortOn) import qualified Data.Text as T +import Data.Ord (Down(..)) import GHC.Generics (Generic) import Network.HTTP.Client (HttpException) import System.Exit (ExitCode (..), exitWith) import System.IO (stderr) import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Sources (Sources(..)) import Text.Printf (printf) import Text.Parsec.Error import Text.Parsec.Pos hiding (Line) import Text.Pandoc.Shared (tshow) import Citeproc (CiteprocError, prettyCiteprocError) -type Input = Text - data PandocError = PandocIOError Text IOError | PandocHttpError Text HttpException | PandocShouldNeverHappenError Text | PandocSomeError Text | PandocParseError Text - | PandocParsecError Input ParseError + | PandocParsecError Sources ParseError | PandocMakePDFError Text | PandocOptionError Text | PandocSyntaxMapError Text | PandocFailOnWarningError | PandocPDFProgramNotFoundError Text | PandocPDFError Text + | PandocXMLError Text Text | PandocFilterError Text Text | PandocLuaError Text | PandocCouldNotFindDataFileError Text @@ -58,6 +61,7 @@ data PandocError = PandocIOError Text IOError | PandocMacroLoop Text | PandocUTF8DecodingError Text Int Word8 | PandocIpynbDecodingError Text + | PandocUnsupportedCharsetError Text | PandocUnknownReaderError Text | PandocUnknownWriterError Text | PandocUnsupportedExtensionError Text Text @@ -67,62 +71,70 @@ data PandocError = PandocIOError Text IOError instance Exception PandocError --- | Handle PandocError by exiting with an error message. -handleError :: Either PandocError a -> IO a -handleError (Right r) = return r -handleError (Left e) = +renderError :: PandocError -> Text +renderError e = case e of - PandocIOError _ err' -> ioError err' - PandocHttpError u err' -> err 61 $ + PandocIOError _ err' -> T.pack $ displayException err' + PandocHttpError u err' -> "Could not fetch " <> u <> "\n" <> tshow err' - PandocShouldNeverHappenError s -> err 62 $ + PandocShouldNeverHappenError s -> "Something we thought was impossible happened!\n" <> "Please report this to pandoc's developers: " <> s - PandocSomeError s -> err 63 s - PandocParseError s -> err 64 s - PandocParsecError input err' -> + PandocSomeError s -> s + PandocParseError s -> s + PandocParsecError (Sources inputs) err' -> let errPos = errorPos err' errLine = sourceLine errPos errColumn = sourceColumn errPos - ls = T.lines input <> [""] - errorInFile = if length ls > errLine - 1 - then T.concat ["\n", ls !! (errLine - 1) - ,"\n", T.replicate (errColumn - 1) " " - ,"^"] - else "" - in err 65 $ "\nError at " <> tshow err' <> - -- if error comes from a chunk or included file, - -- then we won't get the right text this way: - if sourceName errPos == "source" - then errorInFile - else "" - PandocMakePDFError s -> err 66 s - PandocOptionError s -> err 6 s - PandocSyntaxMapError s -> err 67 s - PandocFailOnWarningError -> err 3 "Failing because there were warnings." - PandocPDFProgramNotFoundError pdfprog -> err 47 $ + errFile = sourceName errPos + errorInFile = + case sortOn (Down . sourceLine . 
fst) + [ (pos,t) + | (pos,t) <- inputs + , sourceName pos == errFile + , sourceLine pos <= errLine + ] of + [] -> "" + ((pos,txt):_) -> + let ls = T.lines txt <> [""] + ln = (errLine - sourceLine pos) + 1 + in if length ls > ln && ln >= 1 + then T.concat ["\n", ls !! (ln - 1) + ,"\n", T.replicate (errColumn - 1) " " + ,"^"] + else "" + in "Error at " <> tshow err' <> errorInFile + PandocMakePDFError s -> s + PandocOptionError s -> s + PandocSyntaxMapError s -> s + PandocFailOnWarningError -> "Failing because there were warnings." + PandocPDFProgramNotFoundError pdfprog -> pdfprog <> " not found. Please select a different --pdf-engine or install " <> pdfprog - PandocPDFError logmsg -> err 43 $ "Error producing PDF.\n" <> logmsg - PandocFilterError filtername msg -> err 83 $ "Error running filter " <> + PandocPDFError logmsg -> "Error producing PDF.\n" <> logmsg + PandocXMLError fp logmsg -> "Invalid XML" <> + (if T.null fp then "" else " in " <> fp) <> ":\n" <> logmsg + PandocFilterError filtername msg -> "Error running filter " <> filtername <> ":\n" <> msg - PandocLuaError msg -> err 84 $ "Error running Lua:\n" <> msg - PandocCouldNotFindDataFileError fn -> err 97 $ + PandocLuaError msg -> "Error running Lua:\n" <> msg + PandocCouldNotFindDataFileError fn -> "Could not find data file " <> fn - PandocResourceNotFound fn -> err 99 $ + PandocResourceNotFound fn -> "File " <> fn <> " not found in resource path" - PandocTemplateError s -> err 5 $ "Error compiling template " <> s - PandocAppError s -> err 4 s - PandocEpubSubdirectoryError s -> err 31 $ + PandocTemplateError s -> "Error compiling template " <> s + PandocAppError s -> s + PandocEpubSubdirectoryError s -> "EPUB subdirectory name '" <> s <> "' contains illegal characters" - PandocMacroLoop s -> err 91 $ + PandocMacroLoop s -> "Loop encountered in expanding macro " <> s - PandocUTF8DecodingError f offset w -> err 92 $ + PandocUTF8DecodingError f offset w -> "UTF-8 decoding error in " <> f <> " at byte offset " <> tshow offset <> " (" <> T.pack (printf "%2x" w) <> ").\n" <> "The input must be a UTF-8 encoded text." - PandocIpynbDecodingError w -> err 93 $ + PandocIpynbDecodingError w -> "ipynb decoding error: " <> w - PandocUnknownReaderError r -> err 21 $ + PandocUnsupportedCharsetError charset -> + "Unsupported charset " <> charset + PandocUnknownReaderError r -> "Unknown input format " <> r <> case r of "doc" -> "\nPandoc can convert from DOCX, but not from DOC." <> @@ -130,7 +142,7 @@ handleError (Left e) = " and convert that with pandoc." "pdf" -> "\nPandoc can convert to PDF, but not from PDF." _ -> "" - PandocUnknownWriterError w -> err 22 $ + PandocUnknownWriterError w -> "Unknown output format " <> w <> case w of "pdf" -> "To create a pdf using pandoc, use" <> @@ -139,16 +151,57 @@ handleError (Left e) = ".pdf extension (-o filename.pdf)." "doc" -> "\nPandoc can convert to DOCX, but not to DOC." _ -> "" - PandocUnsupportedExtensionError ext f -> err 23 $ + PandocUnsupportedExtensionError ext f -> "The extension " <> ext <> " is not supported " <> "for " <> f - PandocCiteprocError e' -> err 24 $ + PandocCiteprocError e' -> prettyCiteprocError e' - PandocBibliographyError fp msg -> err 25 $ + PandocBibliographyError fp msg -> "Error reading bibliography file " <> fp <> ":\n" <> msg + +-- | Handle PandocError by exiting with an error message. 
+handleError :: Either PandocError a -> IO a +handleError (Right r) = return r +handleError (Left e) = + case e of + PandocIOError _ err' -> ioError err' + _ -> err exitCode (renderError e) + where + exitCode = + case e of + PandocIOError{} -> 1 + PandocHttpError{} -> 61 + PandocShouldNeverHappenError{} -> 62 + PandocSomeError{} -> 63 + PandocParseError{} -> 64 + PandocParsecError{} -> 65 + PandocMakePDFError{} -> 66 + PandocOptionError{} -> 6 + PandocSyntaxMapError{} -> 67 + PandocFailOnWarningError{} -> 3 + PandocPDFProgramNotFoundError{} -> 47 + PandocPDFError{} -> 43 + PandocXMLError{} -> 44 + PandocFilterError{} -> 83 + PandocLuaError{} -> 84 + PandocCouldNotFindDataFileError{} -> 97 + PandocResourceNotFound{} -> 99 + PandocTemplateError{} -> 5 + PandocAppError{} -> 4 + PandocEpubSubdirectoryError{} -> 31 + PandocMacroLoop{} -> 91 + PandocUTF8DecodingError{} -> 92 + PandocIpynbDecodingError{} -> 93 + PandocUnsupportedCharsetError{} -> 94 + PandocUnknownReaderError{} -> 21 + PandocUnknownWriterError{} -> 22 + PandocUnsupportedExtensionError{} -> 23 + PandocCiteprocError{} -> 24 + PandocBibliographyError{} -> 25 + err :: Int -> Text -> IO a err exitCode msg = do - UTF8.hPutStrLn stderr (T.unpack msg) + UTF8.hPutStrLn stderr msg exitWith $ ExitFailure exitCode return undefined diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index 646f7abfb..9c55d0a7a 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -6,7 +6,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Extensions - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,6 +34,7 @@ module Text.Pandoc.Extensions ( Extension(..) where import Data.Bits (clearBit, setBit, testBit, (.|.)) import Data.Data (Data) +import Data.List (foldl') import qualified Data.Text as T import Data.Typeable (Typeable) import GHC.Generics (Generic) @@ -88,6 +89,7 @@ data Extension = -- does not affect readers/writers directly; it causes -- the eastAsianLineBreakFilter to be applied after -- parsing, in Text.Pandoc.App.convertWithOpts. 
+ | Ext_element_citations -- ^ Use element-citation elements for JATS citations | Ext_emoji -- ^ Support emoji like :smile: | Ext_empty_paragraphs -- ^ Allow empty paragraphs | Ext_epub_html_exts -- ^ Recognise the EPUB extended version of HTML @@ -134,6 +136,8 @@ data Extension = | Ext_raw_html -- ^ Allow raw HTML | Ext_raw_tex -- ^ Allow raw TeX (other than math) | Ext_raw_markdown -- ^ Parse markdown in ipynb as raw markdown + | Ext_rebase_relative_paths -- ^ Rebase relative image and link paths, + -- relative to directory of containing file | Ext_shortcut_reference_links -- ^ Shortcut reference links | Ext_simple_tables -- ^ Pandoc-style simple tables | Ext_smart -- ^ "Smart" quotes, apostrophes, ellipses, dashes @@ -149,9 +153,12 @@ data Extension = | Ext_tex_math_dollars -- ^ TeX math between $..$ or $$..$$ | Ext_tex_math_double_backslash -- ^ TeX math btw \\(..\\) \\[..\\] | Ext_tex_math_single_backslash -- ^ TeX math btw \(..\) \[..\] + | Ext_xrefs_name -- ^ Use xrefs with names + | Ext_xrefs_number -- ^ Use xrefs with numbers | Ext_yaml_metadata_block -- ^ YAML metadata block | Ext_gutenberg -- ^ Use Project Gutenberg conventions for plain | Ext_attributes -- ^ Generic attribute syntax + | Ext_sourcepos -- ^ Include source position attributes deriving (Show, Read, Enum, Eq, Ord, Bounded, Data, Typeable, Generic) -- | Extensions to be used with pandoc-flavored markdown. @@ -349,6 +356,7 @@ getDefaultExtensions "gfm" = extensionsFromList , Ext_strikeout , Ext_task_lists , Ext_emoji + , Ext_yaml_metadata_block ] getDefaultExtensions "commonmark" = extensionsFromList [Ext_raw_html] @@ -374,10 +382,11 @@ getDefaultExtensions "commonmark_x" = extensionsFromList , Ext_raw_attribute , Ext_implicit_header_references , Ext_attributes - , Ext_fenced_code_attributes + , Ext_yaml_metadata_block ] getDefaultExtensions "org" = extensionsFromList [Ext_citations, + Ext_task_lists, Ext_auto_identifiers] getDefaultExtensions "html" = extensionsFromList [Ext_auto_identifiers, @@ -409,6 +418,11 @@ getDefaultExtensions "textile" = extensionsFromList Ext_smart, Ext_raw_html, Ext_auto_identifiers] +getDefaultExtensions "jats" = extensionsFromList + [Ext_auto_identifiers] +getDefaultExtensions "jats_archiving" = getDefaultExtensions "jats" +getDefaultExtensions "jats_publishing" = getDefaultExtensions "jats" +getDefaultExtensions "jats_articleauthoring" = getDefaultExtensions "jats" getDefaultExtensions "opml" = pandocExtensions -- affects notes getDefaultExtensions _ = extensionsFromList [Ext_auto_identifiers] @@ -450,6 +464,7 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_gutenberg , Ext_smart , Ext_literate_haskell + , Ext_rebase_relative_paths ] getAll "markdown_strict" = allMarkdownExtensions getAll "markdown_phpextra" = allMarkdownExtensions @@ -465,6 +480,8 @@ getAllExtensions f = universalExtensions <> getAll f getAll "opendocument" = extensionsFromList [ Ext_empty_paragraphs , Ext_native_numbering + , Ext_xrefs_name + , Ext_xrefs_number ] getAll "odt" = getAll "opendocument" <> autoIdExtensions getAll "muse" = autoIdExtensions <> @@ -498,13 +515,16 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_raw_attribute , Ext_implicit_header_references , Ext_attributes - , Ext_fenced_code_attributes + , Ext_sourcepos + , Ext_yaml_metadata_block + , Ext_rebase_relative_paths ] getAll "commonmark_x" = getAll "commonmark" getAll "org" = autoIdExtensions <> extensionsFromList [ Ext_citations , Ext_smart + , Ext_task_lists ] getAll "html" = autoIdExtensions <> extensionsFromList @@ 
-548,6 +568,14 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_smart , Ext_raw_tex ] + getAll "jats" = + extensionsFromList + [ Ext_auto_identifiers + , Ext_element_citations + ] + getAll "jats_archiving" = getAll "jats" + getAll "jats_publishing" = getAll "jats" + getAll "jats_articleauthoring" = getAll "jats" getAll "opml" = allMarkdownExtensions -- affects notes getAll "twiki" = autoIdExtensions <> extensionsFromList @@ -573,7 +601,7 @@ parseFormatSpec :: T.Text parseFormatSpec = parse formatSpec "" where formatSpec = do name <- formatName - (extsToEnable, extsToDisable) <- foldl (flip ($)) ([],[]) <$> + (extsToEnable, extsToDisable) <- foldl' (flip ($)) ([],[]) <$> many extMod return (T.pack name, reverse extsToEnable, reverse extsToDisable) formatName = many1 $ noneOf "-+" diff --git a/src/Text/Pandoc/Filter.hs b/src/Text/Pandoc/Filter.hs index 6d4846f98..1209ceeb7 100644 --- a/src/Text/Pandoc/Filter.hs +++ b/src/Text/Pandoc/Filter.hs @@ -4,7 +4,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Filter - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> diff --git a/src/Text/Pandoc/Filter/JSON.hs b/src/Text/Pandoc/Filter/JSON.hs index 83ec9a97c..d2323fac4 100644 --- a/src/Text/Pandoc/Filter/JSON.hs +++ b/src/Text/Pandoc/Filter/JSON.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Filter - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> diff --git a/src/Text/Pandoc/Filter/Lua.hs b/src/Text/Pandoc/Filter/Lua.hs index a76c8da2f..c238e53d9 100644 --- a/src/Text/Pandoc/Filter/Lua.hs +++ b/src/Text/Pandoc/Filter/Lua.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Filter.Lua - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> diff --git a/src/Text/Pandoc/Filter/Path.hs b/src/Text/Pandoc/Filter/Path.hs index 9e5e8fa77..1d16c53b9 100644 --- a/src/Text/Pandoc/Filter/Path.hs +++ b/src/Text/Pandoc/Filter/Path.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Filter.Path - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley@edu> diff --git a/src/Text/Pandoc/Highlighting.hs b/src/Text/Pandoc/Highlighting.hs index ce8880f84..62a261e50 100644 --- a/src/Text/Pandoc/Highlighting.hs +++ b/src/Text/Pandoc/Highlighting.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Highlighting - Copyright : Copyright (C) 2008-2020 John MacFarlane + Copyright : Copyright (C) 2008-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -52,12 +52,12 @@ highlightingStyles = ("breezedark", breezeDark), ("haddock", haddock)] -languages :: [T.Text] -languages = [T.toLower (sName s) | s <- M.elems defaultSyntaxMap] +languages :: SyntaxMap -> [T.Text] +languages syntaxmap = [T.toLower (sName s) | s <- M.elems syntaxmap] -languagesByExtension :: T.Text -> [T.Text] -languagesByExtension ext = - [T.toLower (sName s) | s <- syntaxesByExtension defaultSyntaxMap (T.unpack ext)] +languagesByExtension :: SyntaxMap -> T.Text -> [T.Text] 
+languagesByExtension syntaxmap ext = + [T.toLower (sName s) | s <- syntaxesByExtension syntaxmap (T.unpack ext)] highlight :: SyntaxMap -> (FormatOptions -> [SourceLine] -> a) -- ^ Formatter diff --git a/src/Text/Pandoc/Image.hs b/src/Text/Pandoc/Image.hs index e37de4e00..e0c938938 100644 --- a/src/Text/Pandoc/Image.hs +++ b/src/Text/Pandoc/Image.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings, ScopedTypeVariables, CPP #-} {- | Module : Text.Pandoc.Image -Copyright : Copyright (C) 2020 John MacFarlane +Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index fc9e1854b..2b7d10611 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -1,9 +1,9 @@ -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, CPP #-} +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -fno-warn-type-defaults #-} {- | Module : Text.Pandoc.ImageSize -Copyright : Copyright (C) 2011-2020 John MacFarlane +Copyright : Copyright (C) 2011-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -32,33 +32,33 @@ module Text.Pandoc.ImageSize ( ImageType(..) , showInPixel , showFl ) where -import Data.ByteString (ByteString, unpack) +import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy as BL +import Data.Binary.Get import Data.Char (isDigit) import Control.Monad -import Data.Bits -import Data.Binary -import Data.Binary.Get import Text.Pandoc.Shared (safeRead) import Data.Default (Default) import Numeric (showFFloat) import Text.Pandoc.Definition import Text.Pandoc.Options import qualified Text.Pandoc.UTF8 as UTF8 -import qualified Text.XML.Light as Xml -import qualified Data.Map as M +import Text.Pandoc.XML.Light hiding (Attr) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE -import Control.Monad.Except import Control.Applicative -import Data.Maybe (fromMaybe) import qualified Data.Attoparsec.ByteString.Char8 as A +import qualified Codec.Picture.Metadata as Metadata +import qualified Codec.Picture.Metadata.Exif as Exif +import Codec.Picture (decodeImageWithMetadata) -- quick and dirty functions to get image sizes -- algorithms borrowed from wwwis.pl -data ImageType = Png | Gif | Jpeg | Svg | Pdf | Eps | Emf deriving Show +data ImageType = Png | Gif | Jpeg | Svg | Pdf | Eps | Emf | Tiff + deriving Show data Direction = Width | Height instance Show Direction where show Width = "width" @@ -101,6 +101,8 @@ imageType :: ByteString -> Maybe ImageType imageType img = case B.take 4 img of "\x89\x50\x4e\x47" -> return Png "\x47\x49\x46\x38" -> return Gif + "\x49\x49\x2a\x00" -> return Tiff + "\x4D\x4D\x00\x2a" -> return Tiff "\xff\xd8\xff\xe0" -> return Jpeg -- JFIF "\xff\xd8\xff\xe1" -> return Jpeg -- Exif "%PDF" -> return Pdf @@ -122,9 +124,10 @@ findSvgTag img = "<svg" `B.isInfixOf` img || "<SVG" `B.isInfixOf` img imageSize :: WriterOptions -> ByteString -> Either T.Text ImageSize imageSize opts img = checkDpi <$> case imageType img of - Just Png -> mbToEither "could not determine PNG size" $ pngSize img - Just Gif -> mbToEither "could not determine GIF size" $ gifSize img - Just Jpeg -> jpegSize img + Just Png -> getSize img + Just Gif -> getSize img + Just Jpeg -> getSize img + Just Tiff -> getSize img Just Svg -> 
mbToEither "could not determine SVG size" $ svgSize opts img Just Eps -> mbToEither "could not determine EPS size" $ epsSize img Just Pdf -> mbToEither "could not determine PDF size" $ pdfSize img @@ -139,9 +142,6 @@ imageSize opts img = checkDpi <$> , dpiY = if dpiY size == 0 then 72 else dpiY size } -defaultSize :: (Integer, Integer) -defaultSize = (72, 72) - sizeInPixels :: ImageSize -> (Integer, Integer) sizeInPixels s = (pxX s, pxY s) @@ -300,59 +300,50 @@ pPdfSize = do , dpiY = 72 } ) <|> pPdfSize -pngSize :: ByteString -> Maybe ImageSize -pngSize img = do - let (h, rest) = B.splitAt 8 img - guard $ h == "\x8a\x4d\x4e\x47\x0d\x0a\x1a\x0a" || - h == "\x89\x50\x4e\x47\x0d\x0a\x1a\x0a" - let (i, rest') = B.splitAt 4 $ B.drop 4 rest - guard $ i == "MHDR" || i == "IHDR" - let (sizes, rest'') = B.splitAt 8 rest' - (x,y) <- case map fromIntegral $unpack sizes of - ([w1,w2,w3,w4,h1,h2,h3,h4] :: [Integer]) -> return - (shift w1 24 + shift w2 16 + shift w3 8 + w4, - shift h1 24 + shift h2 16 + shift h3 8 + h4) - _ -> Nothing -- "PNG parse error" - (dpix, dpiy) <- findpHYs rest'' - return ImageSize { pxX = x, pxY = y, dpiX = dpix, dpiY = dpiy } - -findpHYs :: ByteString -> Maybe (Integer, Integer) -findpHYs x - | B.null x || "IDAT" `B.isPrefixOf` x = return (72,72) - | "pHYs" `B.isPrefixOf` x = - case map fromIntegral $ unpack $ B.take 9 $ B.drop 4 x of - [x1,x2,x3,x4,y1,y2,y3,y4,u] -> do - let factor = if u == 1 -- dots per meter - then \z -> z * 254 `div` 10000 - else const 72 - return - ( factor $ shift x1 24 + shift x2 16 + shift x3 8 + x4, - factor $ shift y1 24 + shift y2 16 + shift y3 8 + y4 ) - _ -> mzero - | otherwise = findpHYs $ B.drop 1 x -- read another byte - -gifSize :: ByteString -> Maybe ImageSize -gifSize img = do - let (h, rest) = B.splitAt 6 img - guard $ h == "GIF87a" || h == "GIF89a" - case map fromIntegral $ unpack $ B.take 4 rest of - [w2,w1,h2,h1] -> return ImageSize { - pxX = shift w1 8 + w2, - pxY = shift h1 8 + h2, - dpiX = 72, - dpiY = 72 - } - _ -> Nothing -- "GIF parse error" +getSize :: ByteString -> Either T.Text ImageSize +getSize img = + case decodeImageWithMetadata img of + Left e -> Left (T.pack e) + Right (_, meta) -> do + pxx <- maybe (Left "Could not determine width") Right $ + -- first look for exif image width, then width + (Metadata.lookup + (Metadata.Exif (Exif.TagUnknown 0xA002)) meta >>= + exifDataToWord) <|> + Metadata.lookup Metadata.Width meta + pxy <- maybe (Left "Could not determine height") Right $ + -- first look for exif image height, then height + (Metadata.lookup + (Metadata.Exif (Exif.TagUnknown 0xA003)) meta >>= + exifDataToWord) <|> + Metadata.lookup Metadata.Height meta + dpix <- maybe (Right 72) Right $ Metadata.lookup Metadata.DpiX meta + dpiy <- maybe (Right 72) Right $ Metadata.lookup Metadata.DpiY meta + return $ ImageSize + { pxX = fromIntegral pxx + , pxY = fromIntegral pxy + , dpiX = fromIntegral dpix + , dpiY = fromIntegral dpiy } + where + exifDataToWord (Exif.ExifLong x) = Just $ fromIntegral x + exifDataToWord (Exif.ExifShort x) = Just $ fromIntegral x + exifDataToWord _ = Nothing + svgSize :: WriterOptions -> ByteString -> Maybe ImageSize svgSize opts img = do - doc <- Xml.parseXMLDoc $ UTF8.toString img + doc <- either (const mzero) return $ parseXMLElement + $ TL.fromStrict $ UTF8.toText img + let viewboxSize = do + vb <- findAttrBy (== QName "viewBox" Nothing Nothing) doc + [_,_,w,h] <- mapM safeRead (T.words vb) + return (w,h) let dpi = fromIntegral $ writerDpi opts let dirToInt dir = do - dim <- Xml.findAttrBy (== 
Xml.QName dir Nothing Nothing) doc >>= lengthToDim . T.pack + dim <- findAttrBy (== QName dir Nothing Nothing) doc >>= lengthToDim return $ inPixel opts dim - w <- dirToInt "width" - h <- dirToInt "height" + w <- dirToInt "width" <|> (fst <$> viewboxSize) + h <- dirToInt "height" <|> (snd <$> viewboxSize) return ImageSize { pxX = w , pxY = h @@ -390,280 +381,3 @@ emfSize img = case parseheader . BL.fromStrict $ img of Left _ -> Nothing Right (_, _, size) -> Just size - - -jpegSize :: ByteString -> Either T.Text ImageSize -jpegSize img = - let (hdr, rest) = B.splitAt 4 img - in if B.length rest < 14 - then Left "unable to determine JPEG size" - else case hdr of - "\xff\xd8\xff\xe0" -> jfifSize rest - "\xff\xd8\xff\xe1" -> exifSize rest - _ -> Left "unable to determine JPEG size" - -jfifSize :: ByteString -> Either T.Text ImageSize -jfifSize rest = - case map fromIntegral $ unpack $ B.take 5 $ B.drop 9 rest of - [dpiDensity,dpix1,dpix2,dpiy1,dpiy2] -> - let factor = case dpiDensity of - 1 -> id - 2 -> \x -> x * 254 `div` 10 - _ -> const 72 - dpix = factor (shift dpix1 8 + dpix2) - dpiy = factor (shift dpiy1 8 + dpiy2) - in case findJfifSize rest of - Left msg -> Left msg - Right (w,h) -> Right ImageSize { pxX = w - , pxY = h - , dpiX = dpix - , dpiY = dpiy } - _ -> Left "unable to determine JFIF size" - -findJfifSize :: ByteString -> Either T.Text (Integer,Integer) -findJfifSize bs = - let bs' = B.dropWhile (=='\xff') $ B.dropWhile (/='\xff') bs - in case B.uncons bs' of - Just (c,bs'') | c >= '\xc0' && c <= '\xc3' -> - case map fromIntegral $ unpack $ B.take 4 $ B.drop 3 bs'' of - [h1,h2,w1,w2] -> Right (shift w1 8 + w2, shift h1 8 + h2) - _ -> Left "JFIF parse error" - Just (_,bs'') -> - case map fromIntegral $ unpack $ B.take 2 bs'' of - [c1,c2] -> - let len = shift c1 8 + c2 - -- skip variables - in findJfifSize $ B.drop len bs'' - _ -> Left "JFIF parse error" - Nothing -> Left "Did not find JFIF length record" - -runGet' :: Get (Either T.Text a) -> BL.ByteString -> Either T.Text a -runGet' p bl = -#if MIN_VERSION_binary(0,7,0) - case runGetOrFail p bl of - Left (_,_,msg) -> Left $ T.pack msg - Right (_,_,x) -> x -#else - runGet p bl -#endif - -exifSize :: ByteString -> Either T.Text ImageSize -exifSize bs = runGet' header bl - where bl = BL.fromChunks [bs] - header = runExceptT $ exifHeader bl --- NOTE: It would be nicer to do --- runGet ((Just <$> exifHeader) <|> return Nothing) --- which would prevent pandoc from raising an error when an exif header can't --- be parsed. But we only get an Alternative instance for Get in binary 0.6, --- and binary 0.5 ships with ghc 7.6. 
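[Editor's aside, not part of the patch] The block being deleted here is the hand-rolled JPEG/EXIF size parser; the new getSize added earlier in this file's diff obtains the same information from JuicyPixels. A minimal standalone sketch of that API for orientation; reportSize is an illustrative name and the defaulting done by getSize is omitted:

import qualified Data.ByteString as B
import Codec.Picture (decodeImageWithMetadata)
import qualified Codec.Picture.Metadata as Metadata

-- Decode any supported image and print the metadata keys that the new
-- getSize consults (pixel width/height and DPI).
reportSize :: FilePath -> IO ()
reportSize fp = do
  bytes <- B.readFile fp
  case decodeImageWithMetadata bytes of
    Left err        -> putStrLn ("decode failed: " ++ err)
    Right (_, meta) -> print ( Metadata.lookup Metadata.Width  meta
                             , Metadata.lookup Metadata.Height meta
                             , Metadata.lookup Metadata.DpiX   meta
                             , Metadata.lookup Metadata.DpiY   meta )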
- -exifHeader :: BL.ByteString -> ExceptT T.Text Get ImageSize -exifHeader hdr = do - _app1DataSize <- lift getWord16be - exifHdr <- lift getWord32be - unless (exifHdr == 0x45786966) $ throwError "Did not find exif header" - zeros <- lift getWord16be - unless (zeros == 0) $ throwError "Expected zeros after exif header" - -- beginning of tiff header -- we read whole thing to use - -- in getting data from offsets: - let tiffHeader = BL.drop 8 hdr - byteAlign <- lift getWord16be - let bigEndian = byteAlign == 0x4d4d - let (getWord16, getWord32, getWord64) = - if bigEndian - then (getWord16be, getWord32be, getWord64be) - else (getWord16le, getWord32le, getWord64le) - let getRational = do - num <- getWord32 - den <- getWord32 - return $ fromIntegral num / fromIntegral den - tagmark <- lift getWord16 - unless (tagmark == 0x002a) $ throwError "Failed alignment sanity check" - ifdOffset <- lift getWord32 - lift $ skip (fromIntegral ifdOffset - 8) -- skip to IDF - numentries <- lift getWord16 - let ifdEntry :: ExceptT T.Text Get (TagType, DataFormat) - ifdEntry = do - tag <- fromMaybe UnknownTagType . flip M.lookup tagTypeTable - <$> lift getWord16 - dataFormat <- lift getWord16 - numComponents <- lift getWord32 - (fmt, bytesPerComponent) <- - case dataFormat of - 1 -> return (UnsignedByte <$> getWord8, 1) - 2 -> return (AsciiString <$> - getLazyByteString - (fromIntegral numComponents), 1) - 3 -> return (UnsignedShort <$> getWord16, 2) - 4 -> return (UnsignedLong <$> getWord32, 4) - 5 -> return (UnsignedRational <$> getRational, 8) - 6 -> return (SignedByte <$> getWord8, 1) - 7 -> return (Undefined <$> getLazyByteString - (fromIntegral numComponents), 1) - 8 -> return (SignedShort <$> getWord16, 2) - 9 -> return (SignedLong <$> getWord32, 4) - 10 -> return (SignedRational <$> getRational, 8) - 11 -> return (SingleFloat <$> getWord32 {- TODO -}, 4) - 12 -> return (DoubleFloat <$> getWord64 {- TODO -}, 8) - _ -> throwError $ "Unknown data format " <> T.pack (show dataFormat) - let totalBytes = fromIntegral $ numComponents * bytesPerComponent - payload <- if totalBytes <= 4 -- data is right here - then lift $ fmt <* skip (4 - totalBytes) - else do -- get data from offset - offs <- lift getWord32 - let bytesAtOffset = - BL.take (fromIntegral totalBytes) - $ BL.drop (fromIntegral offs) tiffHeader - case runGet' (Right <$> fmt) bytesAtOffset of - Left msg -> throwError msg - Right x -> return x - return (tag, payload) - entries <- replicateM (fromIntegral numentries) ifdEntry - subentries <- case lookup ExifOffset entries of - Just (UnsignedLong offset') -> do - pos <- lift bytesRead - lift $ skip (fromIntegral offset' - (fromIntegral pos - 8)) - numsubentries <- lift getWord16 - replicateM (fromIntegral numsubentries) ifdEntry - _ -> return [] - let allentries = entries ++ subentries - (wdth, hght) <- case (lookup ExifImageWidth allentries, - lookup ExifImageHeight allentries) of - (Just (UnsignedLong w), Just (UnsignedLong h)) -> - return (fromIntegral w, fromIntegral h) - _ -> return defaultSize - -- we return a default width and height when - -- the exif header doesn't contain these - let resfactor = case lookup ResolutionUnit allentries of - Just (UnsignedShort 1) -> 100 / 254 - _ -> 1 - let xres = case lookup XResolution allentries of - Just (UnsignedRational x) -> floor (x * resfactor) - _ -> 72 - let yres = case lookup YResolution allentries of - Just (UnsignedRational y) -> floor (y * resfactor) - _ -> 72 - return ImageSize{ - pxX = wdth - , pxY = hght - , dpiX = xres - , dpiY = yres } - -data 
DataFormat = UnsignedByte Word8 - | AsciiString BL.ByteString - | UnsignedShort Word16 - | UnsignedLong Word32 - | UnsignedRational Rational - | SignedByte Word8 - | Undefined BL.ByteString - | SignedShort Word16 - | SignedLong Word32 - | SignedRational Rational - | SingleFloat Word32 - | DoubleFloat Word64 - deriving (Show) - -data TagType = ImageDescription - | Make - | Model - | Orientation - | XResolution - | YResolution - | ResolutionUnit - | Software - | DateTime - | WhitePoint - | PrimaryChromaticities - | YCbCrCoefficients - | YCbCrPositioning - | ReferenceBlackWhite - | Copyright - | ExifOffset - | ExposureTime - | FNumber - | ExposureProgram - | ISOSpeedRatings - | ExifVersion - | DateTimeOriginal - | DateTimeDigitized - | ComponentConfiguration - | CompressedBitsPerPixel - | ShutterSpeedValue - | ApertureValue - | BrightnessValue - | ExposureBiasValue - | MaxApertureValue - | SubjectDistance - | MeteringMode - | LightSource - | Flash - | FocalLength - | MakerNote - | UserComment - | FlashPixVersion - | ColorSpace - | ExifImageWidth - | ExifImageHeight - | RelatedSoundFile - | ExifInteroperabilityOffset - | FocalPlaneXResolution - | FocalPlaneYResolution - | FocalPlaneResolutionUnit - | SensingMethod - | FileSource - | SceneType - | UnknownTagType - deriving (Show, Eq, Ord) - -tagTypeTable :: M.Map Word16 TagType -tagTypeTable = M.fromList - [ (0x010e, ImageDescription) - , (0x010f, Make) - , (0x0110, Model) - , (0x0112, Orientation) - , (0x011a, XResolution) - , (0x011b, YResolution) - , (0x0128, ResolutionUnit) - , (0x0131, Software) - , (0x0132, DateTime) - , (0x013e, WhitePoint) - , (0x013f, PrimaryChromaticities) - , (0x0211, YCbCrCoefficients) - , (0x0213, YCbCrPositioning) - , (0x0214, ReferenceBlackWhite) - , (0x8298, Copyright) - , (0x8769, ExifOffset) - , (0x829a, ExposureTime) - , (0x829d, FNumber) - , (0x8822, ExposureProgram) - , (0x8827, ISOSpeedRatings) - , (0x9000, ExifVersion) - , (0x9003, DateTimeOriginal) - , (0x9004, DateTimeDigitized) - , (0x9101, ComponentConfiguration) - , (0x9102, CompressedBitsPerPixel) - , (0x9201, ShutterSpeedValue) - , (0x9202, ApertureValue) - , (0x9203, BrightnessValue) - , (0x9204, ExposureBiasValue) - , (0x9205, MaxApertureValue) - , (0x9206, SubjectDistance) - , (0x9207, MeteringMode) - , (0x9208, LightSource) - , (0x9209, Flash) - , (0x920a, FocalLength) - , (0x927c, MakerNote) - , (0x9286, UserComment) - , (0xa000, FlashPixVersion) - , (0xa001, ColorSpace) - , (0xa002, ExifImageWidth) - , (0xa003, ExifImageHeight) - , (0xa004, RelatedSoundFile) - , (0xa005, ExifInteroperabilityOffset) - , (0xa20e, FocalPlaneXResolution) - , (0xa20f, FocalPlaneYResolution) - , (0xa210, FocalPlaneResolutionUnit) - , (0xa217, SensingMethod) - , (0xa300, FileSource) - , (0xa301, SceneType) - ] diff --git a/src/Text/Pandoc/Logging.hs b/src/Text/Pandoc/Logging.hs index 825fdaadb..193b8b61c 100644 --- a/src/Text/Pandoc/Logging.hs +++ b/src/Text/Pandoc/Logging.hs @@ -4,7 +4,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Logging - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -36,6 +36,7 @@ import Data.Typeable (Typeable) import GHC.Generics (Generic) import Text.Pandoc.Definition import Text.Parsec.Pos +import Text.Pandoc.Shared (tshow) -- | Verbosity level. 
data Verbosity = ERROR | WARNING | INFO @@ -84,6 +85,7 @@ data LogMessage = | CouldNotParseCSS Text | Fetching Text | Extracting Text + | LoadedResource FilePath FilePath | NoTitleElement Text | NoLangSpecified | InvalidLang Text @@ -100,6 +102,8 @@ data LogMessage = | FilterCompleted FilePath Integer | CiteprocWarning Text | ATXHeadingInLHS Int Text + | EnvironmentVariableUndefined Text + | DuplicateAttribute Text Text deriving (Show, Eq, Data, Ord, Typeable, Generic) instance ToJSON LogMessage where @@ -192,6 +196,9 @@ instance ToJSON LogMessage where ["path" .= fp] Extracting fp -> ["path" .= fp] + LoadedResource orig found -> + ["for" .= orig + ,"from" .= found] NoTitleElement fallback -> ["fallback" .= fallback] NoLangSpecified -> [] @@ -229,13 +236,20 @@ instance ToJSON LogMessage where ATXHeadingInLHS lvl contents -> ["level" .= lvl ,"contents" .= contents] + EnvironmentVariableUndefined var -> + ["variable" .= var ] + DuplicateAttribute attr val -> + ["attribute" .= attr + ,"value" .= val] showPos :: SourcePos -> Text showPos pos = Text.pack $ sn ++ "line " ++ show (sourceLine pos) ++ " column " ++ show (sourceColumn pos) - where sn = if sourceName pos == "source" || sourceName pos == "" - then "" - else sourceName pos ++ " " + where + sn' = sourceName pos + sn = if sn' == "source" || sn' == "" || sn' == "-" + then "" + else sn' ++ " " encodeLogMessages :: [LogMessage] -> BL.ByteString encodeLogMessages ms = @@ -268,7 +282,7 @@ showLogMessage msg = ParsingUnescaped s pos -> "Parsing unescaped '" <> s <> "' at " <> showPos pos CouldNotLoadIncludeFile fp pos -> - "Could not load include file '" <> fp <> "' at " <> showPos pos + "Could not load include file " <> fp <> " at " <> showPos pos MacroAlreadyDefined name pos -> "Macro '" <> name <> "' already defined, ignoring at " <> showPos pos InlineNotRendered il -> @@ -280,18 +294,18 @@ showLogMessage msg = IgnoredIOError s -> "IO Error (ignored): " <> s CouldNotFetchResource fp s -> - "Could not fetch resource '" <> fp <> "'" <> + "Could not fetch resource " <> fp <> if Text.null s then "" else ": " <> s CouldNotDetermineImageSize fp s -> - "Could not determine image size for '" <> fp <> "'" <> + "Could not determine image size for " <> fp <> if Text.null s then "" else ": " <> s CouldNotConvertImage fp s -> - "Could not convert image '" <> fp <> "'" <> + "Could not convert image " <> fp <> if Text.null s then "" else ": " <> s CouldNotDetermineMimeType fp -> - "Could not determine mime type for '" <> fp <> "'" + "Could not determine mime type for " <> fp CouldNotConvertTeXMath s m -> - "Could not convert TeX math '" <> s <> "', rendering as TeX" <> + "Could not convert TeX math " <> s <> ", rendering as TeX" <> if Text.null m then "" else ":\n" <> m CouldNotParseCSS m -> "Could not parse CSS" <> if Text.null m then "" else ":\n" <> m @@ -299,6 +313,8 @@ showLogMessage msg = "Fetching " <> fp <> "..." Extracting fp -> "Extracting " <> fp <> "..." + LoadedResource orig found -> + "Loaded " <> Text.pack orig <> " from " <> Text.pack found NoTitleElement fallback -> "This document format requires a nonempty <title> element.\n" <> "Defaulting to '" <> fallback <> "' as the title.\n" <> @@ -345,6 +361,10 @@ showLogMessage msg = if lvl < 3 then " Consider using --markdown-headings=setext." else "" + EnvironmentVariableUndefined var -> + "Undefined environment variable " <> var <> " in defaults file." + DuplicateAttribute attr val -> + "Ignoring duplicate attribute " <> attr <> "=" <> tshow val <> "." 
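[Editor's aside, not part of the patch] For context, a sketch of how the three log constructors added in this hunk render through showLogMessage. The output comments follow directly from the cases above; the standalone main wrapper assumes the constructors and showLogMessage are exported from Text.Pandoc.Logging as in the module's public interface:

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text.IO as TIO
import Text.Pandoc.Logging (LogMessage (..), showLogMessage)

main :: IO ()
main = do
  TIO.putStrLn $ showLogMessage (LoadedResource "logo.png" "/usr/share/pandoc/data/logo.png")
  -- Loaded logo.png from /usr/share/pandoc/data/logo.png
  TIO.putStrLn $ showLogMessage (EnvironmentVariableUndefined "FOO")
  -- Undefined environment variable FOO in defaults file.
  TIO.putStrLn $ showLogMessage (DuplicateAttribute "id" "intro")
  -- Ignoring duplicate attribute id="intro".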
messageVerbosity :: LogMessage -> Verbosity messageVerbosity msg = @@ -375,6 +395,7 @@ messageVerbosity msg = CouldNotParseCSS{} -> WARNING Fetching{} -> INFO Extracting{} -> INFO + LoadedResource{} -> INFO NoTitleElement{} -> WARNING NoLangSpecified -> INFO InvalidLang{} -> WARNING @@ -391,3 +412,5 @@ messageVerbosity msg = FilterCompleted{} -> INFO CiteprocWarning{} -> WARNING ATXHeadingInLHS{} -> WARNING + EnvironmentVariableUndefined{}-> WARNING + DuplicateAttribute{} -> WARNING diff --git a/src/Text/Pandoc/Lua.hs b/src/Text/Pandoc/Lua.hs index 39db0074a..f0e9e076b 100644 --- a/src/Text/Pandoc/Lua.hs +++ b/src/Text/Pandoc/Lua.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua - Copyright : Copyright © 2017–2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/ErrorConversion.hs b/src/Text/Pandoc/Lua/ErrorConversion.hs index 59c962723..4e6880722 100644 --- a/src/Text/Pandoc/Lua/ErrorConversion.hs +++ b/src/Text/Pandoc/Lua/ErrorConversion.hs @@ -3,7 +3,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Lua.ErrorConversion - Copyright : © 2020 Albert Krewinkel + Copyright : © 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Filter.hs b/src/Text/Pandoc/Lua/Filter.hs index 94d7adeb2..01bf90efa 100644 --- a/src/Text/Pandoc/Lua/Filter.hs +++ b/src/Text/Pandoc/Lua/Filter.hs @@ -1,8 +1,8 @@ {-# LANGUAGE FlexibleContexts #-} {- | Module : Text.Pandoc.Lua.Filter -Copyright : © 2012–2020 John MacFarlane, - © 2017-2020 Albert Krewinkel +Copyright : © 2012-2021 John MacFarlane, + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> Stability : alpha @@ -13,7 +13,9 @@ module Text.Pandoc.Lua.Filter ( LuaFilterFunction , LuaFilter , runFilterFile , walkInlines + , walkInlineLists , walkBlocks + , walkBlockLists , module Text.Pandoc.Lua.Walk ) where import Control.Applicative ((<|>)) @@ -22,6 +24,7 @@ import Control.Monad.Catch (finally, try) import Data.Data (Data, DataType, dataTypeConstrs, dataTypeName, dataTypeOf, showConstr, toConstr, tyconUQname) import Data.Foldable (foldrM) +import Data.List (foldl') import Data.Map (Map) import Data.Maybe (fromMaybe) import Foreign.Lua (Lua, Peekable, Pushable, StackIndex) @@ -204,7 +207,7 @@ walkMeta lf (Pandoc m bs) = do walkPandoc :: LuaFilter -> Pandoc -> Lua Pandoc walkPandoc (LuaFilter fnMap) = - case foldl mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of + case foldl' mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of Just fn -> \x -> runFilterFunction fn x *> singleElement x Nothing -> return diff --git a/src/Text/Pandoc/Lua/Global.hs b/src/Text/Pandoc/Lua/Global.hs index 4285be662..29b788f04 100644 --- a/src/Text/Pandoc/Lua/Global.hs +++ b/src/Text/Pandoc/Lua/Global.hs @@ -1,7 +1,7 @@ {-# LANGUAGE DeriveDataTypeable #-} {- | Module : Text.Pandoc.Lua - Copyright : Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Init.hs b/src/Text/Pandoc/Lua/Init.hs index e89e9d6e0..baa6f0295 100644 --- a/src/Text/Pandoc/Lua/Init.hs +++ b/src/Text/Pandoc/Lua/Init.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua - Copyright : 
Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -12,17 +12,18 @@ module Text.Pandoc.Lua.Init ( runLua ) where +import Control.Monad (when) import Control.Monad.Catch (try) import Control.Monad.Trans (MonadIO (..)) import Data.Data (Data, dataTypeConstrs, dataTypeOf, showConstr) import Foreign.Lua (Lua) import GHC.IO.Encoding (getForeignEncoding, setForeignEncoding, utf8) +import Text.Pandoc.Class.PandocMonad (readDataFile) import Text.Pandoc.Class.PandocIO (PandocIO) import Text.Pandoc.Error (PandocError) import Text.Pandoc.Lua.Packages (installPandocPackageSearcher) -import Text.Pandoc.Lua.PandocLua (PandocLua, liftPandocLua, - loadScriptFromDataDir, runPandocLua) - +import Text.Pandoc.Lua.PandocLua (PandocLua, liftPandocLua, runPandocLua) +import Text.Pandoc.Lua.Util (throwTopMessageAsError') import qualified Foreign.Lua as Lua import qualified Text.Pandoc.Definition as Pandoc import qualified Text.Pandoc.Lua.Module.Pandoc as ModulePandoc @@ -44,7 +45,7 @@ initLuaState = do liftPandocLua Lua.openlibs installPandocPackageSearcher initPandocModule - loadScriptFromDataDir "init.lua" + loadInitScript "init.lua" where initPandocModule :: PandocLua () initPandocModule = do @@ -61,6 +62,15 @@ initLuaState = do -- assign module to global variable liftPandocLua $ Lua.setglobal "pandoc" + loadInitScript :: FilePath -> PandocLua () + loadInitScript scriptFile = do + script <- readDataFile scriptFile + status <- liftPandocLua $ Lua.dostring script + when (status /= Lua.OK) . liftPandocLua $ + throwTopMessageAsError' + (("Couldn't load '" ++ scriptFile ++ "'.\n") ++) + + -- | AST elements are marshaled via normal constructor functions in the -- @pandoc@ module. However, accessing Lua globals from Haskell is -- expensive (due to error handling). 
Accessing the Lua registry is much diff --git a/src/Text/Pandoc/Lua/Marshaling.hs b/src/Text/Pandoc/Lua/Marshaling.hs index 1254402b6..f517c7c27 100644 --- a/src/Text/Pandoc/Lua/Marshaling.hs +++ b/src/Text/Pandoc/Lua/Marshaling.hs @@ -1,7 +1,7 @@ {- | Module : Text.Pandoc.Lua.Marshaling - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Marshaling/AST.hs b/src/Text/Pandoc/Lua/Marshaling/AST.hs index c889618c4..8e12d232c 100644 --- a/src/Text/Pandoc/Lua/Marshaling/AST.hs +++ b/src/Text/Pandoc/Lua/Marshaling/AST.hs @@ -1,9 +1,10 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE LambdaCase #-} {- | Module : Text.Pandoc.Lua.Marshaling.AST - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -17,6 +18,7 @@ module Text.Pandoc.Lua.Marshaling.AST ) where import Control.Applicative ((<|>)) +import Control.Monad ((<$!>)) import Foreign.Lua (Lua, Peekable, Pushable, StackIndex) import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError) @@ -32,17 +34,16 @@ instance Pushable Pandoc where pushViaConstructor "Pandoc" blocks meta instance Peekable Pandoc where - peek idx = defineHowTo "get Pandoc value" $ do - blocks <- LuaUtil.rawField idx "blocks" - meta <- LuaUtil.rawField idx "meta" - return $ Pandoc meta blocks + peek idx = defineHowTo "get Pandoc value" $! Pandoc + <$!> LuaUtil.rawField idx "meta" + <*> LuaUtil.rawField idx "blocks" instance Pushable Meta where push (Meta mmap) = pushViaConstructor "Meta" mmap instance Peekable Meta where - peek idx = defineHowTo "get Meta value" $ - Meta <$> Lua.peek idx + peek idx = defineHowTo "get Meta value" $! + Meta <$!> Lua.peek idx instance Pushable MetaValue where push = pushMetaValue @@ -68,14 +69,13 @@ instance Pushable Citation where pushViaConstructor "Citation" cid mode prefix suffix noteNum hash instance Peekable Citation where - peek idx = do - id' <- LuaUtil.rawField idx "id" - prefix <- LuaUtil.rawField idx "prefix" - suffix <- LuaUtil.rawField idx "suffix" - mode <- LuaUtil.rawField idx "mode" - num <- LuaUtil.rawField idx "note_num" - hash <- LuaUtil.rawField idx "hash" - return $ Citation id' prefix suffix mode num hash + peek idx = Citation + <$!> LuaUtil.rawField idx "id" + <*> LuaUtil.rawField idx "prefix" + <*> LuaUtil.rawField idx "suffix" + <*> LuaUtil.rawField idx "mode" + <*> LuaUtil.rawField idx "note_num" + <*> LuaUtil.rawField idx "hash" instance Pushable Alignment where push = Lua.push . show @@ -90,7 +90,7 @@ instance Peekable CitationMode where instance Pushable Format where push (Format f) = Lua.push f instance Peekable Format where - peek idx = Format <$> Lua.peek idx + peek idx = Format <$!> Lua.peek idx instance Pushable ListNumberDelim where push = Lua.push . 
show @@ -130,26 +130,26 @@ peekMetaValue idx = defineHowTo "get MetaValue" $ do elementContent = Lua.peek idx luatype <- Lua.ltype idx case luatype of - Lua.TypeBoolean -> MetaBool <$> Lua.peek idx - Lua.TypeString -> MetaString <$> Lua.peek idx + Lua.TypeBoolean -> MetaBool <$!> Lua.peek idx + Lua.TypeString -> MetaString <$!> Lua.peek idx Lua.TypeTable -> do tag <- try $ LuaUtil.getTag idx case tag of - Right "MetaBlocks" -> MetaBlocks <$> elementContent - Right "MetaBool" -> MetaBool <$> elementContent - Right "MetaMap" -> MetaMap <$> elementContent - Right "MetaInlines" -> MetaInlines <$> elementContent - Right "MetaList" -> MetaList <$> elementContent - Right "MetaString" -> MetaString <$> elementContent + Right "MetaBlocks" -> MetaBlocks <$!> elementContent + Right "MetaBool" -> MetaBool <$!> elementContent + Right "MetaMap" -> MetaMap <$!> elementContent + Right "MetaInlines" -> MetaInlines <$!> elementContent + Right "MetaList" -> MetaList <$!> elementContent + Right "MetaString" -> MetaString <$!> elementContent Right t -> Lua.throwMessage ("Unknown meta tag: " <> t) Left _ -> do -- no meta value tag given, try to guess. len <- Lua.rawlen idx if len <= 0 - then MetaMap <$> Lua.peek idx - else (MetaInlines <$> Lua.peek idx) - <|> (MetaBlocks <$> Lua.peek idx) - <|> (MetaList <$> Lua.peek idx) + then MetaMap <$!> Lua.peek idx + else (MetaInlines <$!> Lua.peek idx) + <|> (MetaBlocks <$!> Lua.peek idx) + <|> (MetaList <$!> Lua.peek idx) _ -> Lua.throwMessage "could not get meta value" -- | Push a block element to the top of the Lua stack. @@ -174,25 +174,25 @@ pushBlock = \case -- | Return the value at the given index as block if possible. peekBlock :: StackIndex -> Lua Block -peekBlock idx = defineHowTo "get Block value" $ do +peekBlock idx = defineHowTo "get Block value" $! do tag <- LuaUtil.getTag idx case tag of - "BlockQuote" -> BlockQuote <$> elementContent - "BulletList" -> BulletList <$> elementContent - "CodeBlock" -> withAttr CodeBlock <$> elementContent - "DefinitionList" -> DefinitionList <$> elementContent - "Div" -> withAttr Div <$> elementContent + "BlockQuote" -> BlockQuote <$!> elementContent + "BulletList" -> BulletList <$!> elementContent + "CodeBlock" -> withAttr CodeBlock <$!> elementContent + "DefinitionList" -> DefinitionList <$!> elementContent + "Div" -> withAttr Div <$!> elementContent "Header" -> (\(lvl, LuaAttr attr, lst) -> Header lvl attr lst) - <$> elementContent + <$!> elementContent "HorizontalRule" -> return HorizontalRule - "LineBlock" -> LineBlock <$> elementContent + "LineBlock" -> LineBlock <$!> elementContent "OrderedList" -> (\(LuaListAttributes lstAttr, lst) -> OrderedList lstAttr lst) - <$> elementContent + <$!> elementContent "Null" -> return Null - "Para" -> Para <$> elementContent - "Plain" -> Plain <$> elementContent - "RawBlock" -> uncurry RawBlock <$> elementContent + "Para" -> Para <$!> elementContent + "Plain" -> Plain <$!> elementContent + "RawBlock" -> uncurry RawBlock <$!> elementContent "Table" -> (\(attr, capt, colSpecs, thead, tbodies, tfoot) -> Table (fromLuaAttr attr) capt @@ -200,7 +200,7 @@ peekBlock idx = defineHowTo "get Block value" $ do thead tbodies tfoot) - <$> elementContent + <$!> elementContent _ -> Lua.throwMessage ("Unknown block type: " <> tag) where -- Get the contents of an AST element. 
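[Editor's aside, not part of the patch] The recurring change from <$> to <$!> in these marshaling hunks swaps lazy fmap for the strict version from Control.Monad, presumably so peeked values are fully evaluated while the corresponding Lua values are still on the stack (compare the "strict to Lua string is copied before gc" comment further down). A brief reminder of the operator; strictFmapDemo is illustrative only:

import Control.Monad ((<$!>))

-- (<$!>) :: Monad m => (a -> b) -> m a -> m b
-- f <$!> m = do { x <- m; let z = f x in z `seq` return z }
strictFmapDemo :: IO Int
strictFmapDemo = (+ 1) <$!> return 41  -- forces the result (42) before returning it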
@@ -222,15 +222,14 @@ pushCaption (Caption shortCaption longCaption) = do -- | Peek Caption element peekCaption :: StackIndex -> Lua Caption -peekCaption idx = do - short <- Lua.fromOptional <$> LuaUtil.rawField idx "short" - long <- LuaUtil.rawField idx "long" - return $ Caption short long +peekCaption idx = Caption + <$!> (Lua.fromOptional <$!> LuaUtil.rawField idx "short") + <*> LuaUtil.rawField idx "long" instance Peekable ColWidth where peek idx = do - width <- Lua.fromOptional <$> Lua.peek idx - return $ maybe ColWidthDefault ColWidth width + width <- Lua.fromOptional <$!> Lua.peek idx + return $! maybe ColWidthDefault ColWidth width instance Pushable ColWidth where push = \case @@ -252,12 +251,11 @@ instance Pushable TableBody where LuaUtil.addField "body" body instance Peekable TableBody where - peek idx = do - attr <- LuaUtil.rawField idx "attr" - rowHeadColumns <- LuaUtil.rawField idx "row_head_columns" - head' <- LuaUtil.rawField idx "head" - body <- LuaUtil.rawField idx "body" - return $ TableBody attr (RowHeadColumns rowHeadColumns) head' body + peek idx = TableBody + <$!> LuaUtil.rawField idx "attr" + <*> (RowHeadColumns <$!> LuaUtil.rawField idx "row_head_columns") + <*> LuaUtil.rawField idx "head" + <*> LuaUtil.rawField idx "body" instance Pushable TableHead where push (TableHead attr rows) = Lua.push (attr, rows) @@ -287,13 +285,12 @@ pushCell (Cell attr align (RowSpan rowSpan) (ColSpan colSpan) contents) = do LuaUtil.addField "contents" contents peekCell :: StackIndex -> Lua Cell -peekCell idx = do - attr <- fromLuaAttr <$> LuaUtil.rawField idx "attr" - align <- LuaUtil.rawField idx "alignment" - rowSpan <- LuaUtil.rawField idx "row_span" - colSpan <- LuaUtil.rawField idx "col_span" - contents <- LuaUtil.rawField idx "contents" - return $ Cell attr align (RowSpan rowSpan) (ColSpan colSpan) contents +peekCell idx = Cell + <$!> (fromLuaAttr <$!> LuaUtil.rawField idx "attr") + <*> LuaUtil.rawField idx "alignment" + <*> (RowSpan <$!> LuaUtil.rawField idx "row_span") + <*> (ColSpan <$!> LuaUtil.rawField idx "col_span") + <*> LuaUtil.rawField idx "contents" -- | Push an inline element to the top of the lua stack. 
pushInline :: Inline -> Lua () @@ -324,28 +321,29 @@ peekInline :: StackIndex -> Lua Inline peekInline idx = defineHowTo "get Inline value" $ do tag <- LuaUtil.getTag idx case tag of - "Cite" -> uncurry Cite <$> elementContent - "Code" -> withAttr Code <$> elementContent - "Emph" -> Emph <$> elementContent - "Underline" -> Underline <$> elementContent - "Image" -> (\(LuaAttr attr, lst, tgt) -> Image attr lst tgt) - <$> elementContent - "Link" -> (\(LuaAttr attr, lst, tgt) -> Link attr lst tgt) - <$> elementContent + "Cite" -> uncurry Cite <$!> elementContent + "Code" -> withAttr Code <$!> elementContent + "Emph" -> Emph <$!> elementContent + "Underline" -> Underline <$!> elementContent + "Image" -> (\(LuaAttr !attr, !lst, !tgt) -> Image attr lst tgt) + <$!> elementContent + "Link" -> (\(LuaAttr !attr, !lst, !tgt) -> Link attr lst tgt) + <$!> elementContent "LineBreak" -> return LineBreak - "Note" -> Note <$> elementContent - "Math" -> uncurry Math <$> elementContent - "Quoted" -> uncurry Quoted <$> elementContent - "RawInline" -> uncurry RawInline <$> elementContent - "SmallCaps" -> SmallCaps <$> elementContent + "Note" -> Note <$!> elementContent + "Math" -> uncurry Math <$!> elementContent + "Quoted" -> uncurry Quoted <$!> elementContent + "RawInline" -> uncurry RawInline <$!> elementContent + "SmallCaps" -> SmallCaps <$!> elementContent "SoftBreak" -> return SoftBreak "Space" -> return Space - "Span" -> withAttr Span <$> elementContent - "Str" -> Str <$> elementContent - "Strikeout" -> Strikeout <$> elementContent - "Strong" -> Strong <$> elementContent - "Subscript" -> Subscript <$> elementContent - "Superscript"-> Superscript <$> elementContent + "Span" -> withAttr Span <$!> elementContent + -- strict to Lua string is copied before gc + "Str" -> Str <$!> elementContent + "Strikeout" -> Strikeout <$!> elementContent + "Strong" -> Strong <$!> elementContent + "Subscript" -> Subscript <$!> elementContent + "Superscript"-> Superscript <$!> elementContent _ -> Lua.throwMessage ("Unknown inline type: " <> tag) where -- Get the contents of an AST element. @@ -366,7 +364,7 @@ instance Pushable LuaAttr where pushViaConstructor "Attr" id' classes kv instance Peekable LuaAttr where - peek idx = defineHowTo "get Attr value" (LuaAttr <$> Lua.peek idx) + peek idx = defineHowTo "get Attr value" $! 
(LuaAttr <$!> Lua.peek idx) -- | Wrapper for ListAttributes newtype LuaListAttributes = LuaListAttributes ListAttributes diff --git a/src/Text/Pandoc/Lua/Marshaling/AnyValue.hs b/src/Text/Pandoc/Lua/Marshaling/AnyValue.hs index c4720aedf..82e26b963 100644 --- a/src/Text/Pandoc/Lua/Marshaling/AnyValue.hs +++ b/src/Text/Pandoc/Lua/Marshaling/AnyValue.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua.Marshaling.AnyValue - Copyright : © 2017-2020 Albert Krewinkel + Copyright : © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Marshaling/CommonState.hs b/src/Text/Pandoc/Lua/Marshaling/CommonState.hs index 636650af3..147197c5d 100644 --- a/src/Text/Pandoc/Lua/Marshaling/CommonState.hs +++ b/src/Text/Pandoc/Lua/Marshaling/CommonState.hs @@ -3,8 +3,8 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Lua.Marshaling.CommonState - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> Stability : alpha diff --git a/src/Text/Pandoc/Lua/Marshaling/Context.hs b/src/Text/Pandoc/Lua/Marshaling/Context.hs index effcc675d..606bdcfb2 100644 --- a/src/Text/Pandoc/Lua/Marshaling/Context.hs +++ b/src/Text/Pandoc/Lua/Marshaling/Context.hs @@ -1,8 +1,8 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} {- | Module : Text.Pandoc.Lua.Marshaling.Context - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -22,6 +22,7 @@ instance (TemplateTarget a, Pushable a) => Pushable (Context a) where instance (TemplateTarget a, Pushable a) => Pushable (Val a) where push NullVal = Lua.push () + push (BoolVal b) = Lua.push b push (MapVal ctx) = Lua.push ctx push (ListVal xs) = Lua.push xs push (SimpleVal d) = Lua.push $ render Nothing d diff --git a/src/Text/Pandoc/Lua/Marshaling/List.hs b/src/Text/Pandoc/Lua/Marshaling/List.hs index e6614400d..0446302a1 100644 --- a/src/Text/Pandoc/Lua/Marshaling/List.hs +++ b/src/Text/Pandoc/Lua/Marshaling/List.hs @@ -4,8 +4,8 @@ {-# LANGUAGE UndecidableInstances #-} {- | Module : Text.Pandoc.Lua.Marshaling.List -Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel +Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> Stability : alpha diff --git a/src/Text/Pandoc/Lua/Marshaling/MediaBag.hs b/src/Text/Pandoc/Lua/Marshaling/MediaBag.hs index 2cf5b8893..70bd010a0 100644 --- a/src/Text/Pandoc/Lua/Marshaling/MediaBag.hs +++ b/src/Text/Pandoc/Lua/Marshaling/MediaBag.hs @@ -1,7 +1,7 @@ {- | Module : Text.Pandoc.Lua.Marshaling.MediaBag - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> Stability : alpha diff --git a/src/Text/Pandoc/Lua/Marshaling/PandocError.hs b/src/Text/Pandoc/Lua/Marshaling/PandocError.hs index 74537a1dd..f698704e0 100644 --- a/src/Text/Pandoc/Lua/Marshaling/PandocError.hs +++ b/src/Text/Pandoc/Lua/Marshaling/PandocError.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ScopedTypeVariables 
#-} {- | Module : Text.Pandoc.Lua.Marshaling.PandocError - Copyright : © 2020 Albert Krewinkel + Copyright : © 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Marshaling/ReaderOptions.hs b/src/Text/Pandoc/Lua/Marshaling/ReaderOptions.hs index 2e45affe4..dd7bf2e61 100644 --- a/src/Text/Pandoc/Lua/Marshaling/ReaderOptions.hs +++ b/src/Text/Pandoc/Lua/Marshaling/ReaderOptions.hs @@ -4,8 +4,8 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} {- | Module : Text.Pandoc.Lua.Marshaling.ReaderOptions - Copyright : © 2012-2020 John MacFarlane - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Marshaling/SimpleTable.hs b/src/Text/Pandoc/Lua/Marshaling/SimpleTable.hs index 98fa1efa4..6d43039fa 100644 --- a/src/Text/Pandoc/Lua/Marshaling/SimpleTable.hs +++ b/src/Text/Pandoc/Lua/Marshaling/SimpleTable.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua.Marshaling.SimpleTable - Copyright : © 2020 Albert Krewinkel + Copyright : © 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Marshaling/Version.hs b/src/Text/Pandoc/Lua/Marshaling/Version.hs index 9adb1b763..4f4ffac51 100644 --- a/src/Text/Pandoc/Lua/Marshaling/Version.hs +++ b/src/Text/Pandoc/Lua/Marshaling/Version.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Lua.Marshaling.Version - Copyright : © 2019-2020 Albert Krewinkel + Copyright : © 2019-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Module/MediaBag.hs b/src/Text/Pandoc/Lua/Module/MediaBag.hs index e5a10217a..3eed50fca 100644 --- a/src/Text/Pandoc/Lua/Module/MediaBag.hs +++ b/src/Text/Pandoc/Lua/Module/MediaBag.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Lua.Module.MediaBag - Copyright : Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -13,6 +13,7 @@ module Text.Pandoc.Lua.Module.MediaBag ( pushModule ) where +import Prelude hiding (lookup) import Control.Monad (zipWithM_) import Foreign.Lua (Lua, NumResults, Optional) import Text.Pandoc.Class.CommonState (CommonState (..)) @@ -36,10 +37,10 @@ pushModule = do liftPandocLua Lua.newtable addFunction "delete" delete addFunction "empty" empty - addFunction "insert" insertMediaFn + addFunction "insert" insert addFunction "items" items - addFunction "lookup" lookupMediaFn - addFunction "list" mediaDirectoryFn + addFunction "lookup" lookup + addFunction "list" list addFunction "fetch" fetch return 1 @@ -53,11 +54,11 @@ empty :: PandocLua NumResults empty = 0 <$ modifyCommonState (\st -> st { stMediaBag = mempty }) -- | Insert a new item into the media bag. 
-insertMediaFn :: FilePath - -> Optional MimeType - -> BL.ByteString - -> PandocLua NumResults -insertMediaFn fp optionalMime contents = do +insert :: FilePath + -> Optional MimeType + -> BL.ByteString + -> PandocLua NumResults +insert fp optionalMime contents = do mb <- getMediaBag setMediaBag $ MB.insertMedia fp (Lua.fromOptional optionalMime) contents mb return (Lua.NumResults 0) @@ -66,19 +67,19 @@ insertMediaFn fp optionalMime contents = do items :: PandocLua NumResults items = getMediaBag >>= liftPandocLua . pushIterator -lookupMediaFn :: FilePath - -> PandocLua NumResults -lookupMediaFn fp = do +lookup :: FilePath + -> PandocLua NumResults +lookup fp = do res <- MB.lookupMedia fp <$> getMediaBag liftPandocLua $ case res of Nothing -> 1 <$ Lua.pushnil - Just (mimeType, contents) -> do - Lua.push mimeType - Lua.push contents + Just item -> do + Lua.push $ MB.mediaMimeType item + Lua.push $ MB.mediaContents item return 2 -mediaDirectoryFn :: PandocLua NumResults -mediaDirectoryFn = do +list :: PandocLua NumResults +list = do dirContents <- MB.mediaDirectory <$> getMediaBag liftPandocLua $ do Lua.newtable diff --git a/src/Text/Pandoc/Lua/Module/Pandoc.hs b/src/Text/Pandoc/Lua/Module/Pandoc.hs index 3886568b7..5c14b3a30 100644 --- a/src/Text/Pandoc/Lua/Module/Pandoc.hs +++ b/src/Text/Pandoc/Lua/Module/Pandoc.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Lua.Module.Pandoc - Copyright : Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -14,6 +14,7 @@ module Text.Pandoc.Lua.Module.Pandoc ( pushModule ) where +import Prelude hiding (read) import Control.Monad (when) import Control.Monad.Except (throwError) import Data.Default (Default (..)) @@ -22,10 +23,12 @@ import Foreign.Lua (Lua, NumResults, Optional, Peekable, Pushable) import System.Exit (ExitCode (..)) import Text.Pandoc.Class.PandocIO (runIO) import Text.Pandoc.Definition (Block, Inline) -import Text.Pandoc.Lua.Filter (walkInlines, walkBlocks, LuaFilter, SingletonsList (..)) +import Text.Pandoc.Lua.Filter (LuaFilter, SingletonsList (..), walkInlines, + walkInlineLists, walkBlocks, walkBlockLists) import Text.Pandoc.Lua.Marshaling () +import Text.Pandoc.Lua.Marshaling.List (List (..)) import Text.Pandoc.Lua.PandocLua (PandocLua, addFunction, liftPandocLua, - loadScriptFromDataDir) + loadDefaultModule) import Text.Pandoc.Walk (Walkable) import Text.Pandoc.Options (ReaderOptions (readerExtensions)) import Text.Pandoc.Process (pipeProcess) @@ -38,30 +41,33 @@ import qualified Foreign.Lua as Lua import qualified Text.Pandoc.Lua.Util as LuaUtil import Text.Pandoc.Error --- | Push the "pandoc" on the lua stack. Requires the `list` module to be --- loaded. +-- | Push the "pandoc" package to the Lua stack. Requires the `List` +-- module to be loadable. 
pushModule :: PandocLua NumResults pushModule = do - loadScriptFromDataDir "pandoc.lua" - addFunction "read" readDoc - addFunction "pipe" pipeFn - addFunction "walk_block" walkBlock - addFunction "walk_inline" walkInline + loadDefaultModule "pandoc" + addFunction "read" read + addFunction "pipe" pipe + addFunction "walk_block" walk_block + addFunction "walk_inline" walk_inline return 1 walkElement :: (Walkable (SingletonsList Inline) a, - Walkable (SingletonsList Block) a) + Walkable (SingletonsList Block) a, + Walkable (List Inline) a, + Walkable (List Block) a) => a -> LuaFilter -> PandocLua a -walkElement x f = liftPandocLua $ walkInlines f x >>= walkBlocks f +walkElement x f = liftPandocLua $ + walkInlines f x >>= walkInlineLists f >>= walkBlocks f >>= walkBlockLists f -walkInline :: Inline -> LuaFilter -> PandocLua Inline -walkInline = walkElement +walk_inline :: Inline -> LuaFilter -> PandocLua Inline +walk_inline = walkElement -walkBlock :: Block -> LuaFilter -> PandocLua Block -walkBlock = walkElement +walk_block :: Block -> LuaFilter -> PandocLua Block +walk_block = walkElement -readDoc :: T.Text -> Optional T.Text -> PandocLua NumResults -readDoc content formatSpecOrNil = liftPandocLua $ do +read :: T.Text -> Optional T.Text -> PandocLua NumResults +read content formatSpecOrNil = liftPandocLua $ do let formatSpec = fromMaybe "markdown" (Lua.fromOptional formatSpecOrNil) res <- Lua.liftIO . runIO $ getReader formatSpec >>= \(rdr,es) -> @@ -79,11 +85,11 @@ readDoc content formatSpecOrNil = liftPandocLua $ do Left e -> Lua.raiseError $ show e -- | Pipes input through a command. -pipeFn :: String - -> [String] - -> BL.ByteString - -> PandocLua NumResults -pipeFn command args input = liftPandocLua $ do +pipe :: String -- ^ path to executable + -> [String] -- ^ list of arguments + -> BL.ByteString -- ^ input passed to process via stdin + -> PandocLua NumResults +pipe command args input = liftPandocLua $ do (ec, output) <- Lua.liftIO $ pipeProcess Nothing command args input case ec of ExitSuccess -> 1 <$ Lua.push output diff --git a/src/Text/Pandoc/Lua/Module/System.hs b/src/Text/Pandoc/Lua/Module/System.hs index 04508e461..bd35babaf 100644 --- a/src/Text/Pandoc/Lua/Module/System.hs +++ b/src/Text/Pandoc/Lua/Module/System.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua.Module.System - Copyright : © 2019-2020 Albert Krewinkel + Copyright : © 2019-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Module/Types.hs b/src/Text/Pandoc/Lua/Module/Types.hs index 999f2e588..bb4f02c3c 100644 --- a/src/Text/Pandoc/Lua/Module/Types.hs +++ b/src/Text/Pandoc/Lua/Module/Types.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Lua.Module.Types - Copyright : © 2019-2020 Albert Krewinkel + Copyright : © 2019-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Module/Utils.hs b/src/Text/Pandoc/Lua/Module/Utils.hs index 7595b9c0f..3ec3afc26 100644 --- a/src/Text/Pandoc/Lua/Module/Utils.hs +++ b/src/Text/Pandoc/Lua/Module/Utils.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Lua.Module.Utils - Copyright : Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -146,7 +146,7 @@ from_simple_table (SimpleTable capt aligns widths head' body) = do 
nullAttr (Caption Nothing [Plain capt]) (zipWith (\a w -> (a, toColWidth w)) aligns widths) - (TableHead nullAttr [blockListToRow head']) + (TableHead nullAttr [blockListToRow head' | not (null head') ]) [TableBody nullAttr 0 [] $ map blockListToRow body] (TableFoot nullAttr []) return (NumResults 1) diff --git a/src/Text/Pandoc/Lua/Packages.hs b/src/Text/Pandoc/Lua/Packages.hs index 4c3b9d79d..2f1c139db 100644 --- a/src/Text/Pandoc/Lua/Packages.hs +++ b/src/Text/Pandoc/Lua/Packages.hs @@ -1,9 +1,6 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE LambdaCase #-} -{-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Lua.Packages - Copyright : Copyright © 2017-2020 Albert Krewinkel + Copyright : Copyright © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -15,15 +12,12 @@ module Text.Pandoc.Lua.Packages ( installPandocPackageSearcher ) where -import Control.Monad.Catch (try) import Control.Monad (forM_) -import Data.ByteString (ByteString) -import Foreign.Lua (Lua, NumResults) -import Text.Pandoc.Error (PandocError) -import Text.Pandoc.Class.PandocMonad (readDataFile) -import Text.Pandoc.Lua.PandocLua (PandocLua, liftPandocLua) +import Foreign.Lua (NumResults) +import Text.Pandoc.Lua.PandocLua (PandocLua, liftPandocLua, loadDefaultModule) import qualified Foreign.Lua as Lua +import qualified Foreign.Lua.Module.Path as Path import qualified Foreign.Lua.Module.Text as Text import qualified Text.Pandoc.Lua.Module.Pandoc as Pandoc import qualified Text.Pandoc.Lua.Module.MediaBag as MediaBag @@ -50,28 +44,17 @@ pandocPackageSearcher pkgName = case pkgName of "pandoc" -> pushWrappedHsFun Pandoc.pushModule "pandoc.mediabag" -> pushWrappedHsFun MediaBag.pushModule + "pandoc.path" -> pushWrappedHsFun Path.pushModule "pandoc.system" -> pushWrappedHsFun System.pushModule "pandoc.types" -> pushWrappedHsFun Types.pushModule "pandoc.utils" -> pushWrappedHsFun Utils.pushModule "text" -> pushWrappedHsFun Text.pushModule - _ -> searchPureLuaLoader + "pandoc.List" -> pushWrappedHsFun (loadDefaultModule pkgName) + _ -> reportPandocSearcherFailure where pushWrappedHsFun f = liftPandocLua $ do Lua.pushHaskellFunction f return 1 - searchPureLuaLoader = do - let filename = pkgName ++ ".lua" - try (readDataFile filename) >>= \case - Right script -> pushWrappedHsFun (loadStringAsPackage pkgName script) - Left (_ :: PandocError) -> liftPandocLua $ do - Lua.push ("\n\tno file '" ++ filename ++ "' in pandoc's datadir") - return (1 :: NumResults) - -loadStringAsPackage :: String -> ByteString -> Lua NumResults -loadStringAsPackage pkgName script = do - status <- Lua.dostring script - if status == Lua.OK - then return (1 :: NumResults) - else do - msg <- Lua.popValue - Lua.raiseError ("Error while loading `" <> pkgName <> "`.\n" <> msg) + reportPandocSearcherFailure = liftPandocLua $ do + Lua.push ("\n\t" <> pkgName <> "is not one of pandoc's default packages") + return (1 :: NumResults) diff --git a/src/Text/Pandoc/Lua/PandocLua.hs b/src/Text/Pandoc/Lua/PandocLua.hs index 6c3b410dd..750e019b6 100644 --- a/src/Text/Pandoc/Lua/PandocLua.hs +++ b/src/Text/Pandoc/Lua/PandocLua.hs @@ -9,7 +9,7 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} {- | Module : Text.Pandoc.Lua.PandocLua - Copyright : Copyright © 2020 Albert Krewinkel + Copyright : Copyright © 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -23,24 +23,23 @@ module Text.Pandoc.Lua.PandocLua , 
runPandocLua , liftPandocLua , addFunction - , loadScriptFromDataDir + , loadDefaultModule ) where -import Control.Monad (when) import Control.Monad.Catch (MonadCatch, MonadMask, MonadThrow) import Control.Monad.Except (MonadError (catchError, throwError)) import Control.Monad.IO.Class (MonadIO (liftIO)) import Foreign.Lua (Lua (..), NumResults, Pushable, ToHaskellFunction) import Text.Pandoc.Class.PandocIO (PandocIO) -import Text.Pandoc.Class.PandocMonad (PandocMonad (..), readDataFile) -import Text.Pandoc.Error (PandocError) +import Text.Pandoc.Class.PandocMonad (PandocMonad (..), readDefaultDataFile) +import Text.Pandoc.Error (PandocError (PandocLuaError)) import Text.Pandoc.Lua.Global (Global (..), setGlobals) import Text.Pandoc.Lua.ErrorConversion (errorConversion) import qualified Control.Monad.Catch as Catch +import qualified Data.Text as T import qualified Foreign.Lua as Lua import qualified Text.Pandoc.Class.IO as IO -import qualified Text.Pandoc.Lua.Util as LuaUtil -- | Type providing access to both, pandoc and Lua operations. newtype PandocLua a = PandocLua { unPandocLua :: Lua a } @@ -86,14 +85,22 @@ addFunction name fn = liftPandocLua $ do Lua.pushHaskellFunction fn Lua.rawset (-3) --- | Load a file from pandoc's data directory. -loadScriptFromDataDir :: FilePath -> PandocLua () -loadScriptFromDataDir scriptFile = do - script <- readDataFile scriptFile +-- | Load a pure Lua module included with pandoc. Leaves the result on +-- the stack and returns @NumResults 1@. +-- +-- The script is loaded from the default data directory. We do not load +-- from data directories supplied via command line, as this could cause +-- scripts to be executed even though they had not been passed explicitly. +loadDefaultModule :: String -> PandocLua NumResults +loadDefaultModule name = do + script <- readDefaultDataFile (name <> ".lua") status <- liftPandocLua $ Lua.dostring script - when (status /= Lua.OK) . liftPandocLua $ - LuaUtil.throwTopMessageAsError' - (("Couldn't load '" ++ scriptFile ++ "'.\n") ++) + if status == Lua.OK + then return (1 :: NumResults) + else do + msg <- liftPandocLua Lua.popValue + let err = "Error while loading `" <> name <> "`.\n" <> msg + throwError $ PandocLuaError (T.pack err) -- | Global variables which should always be set. 
defaultGlobals :: PandocIO [Global] diff --git a/src/Text/Pandoc/Lua/Util.hs b/src/Text/Pandoc/Lua/Util.hs index fbd013801..70a8a6d47 100644 --- a/src/Text/Pandoc/Lua/Util.hs +++ b/src/Text/Pandoc/Lua/Util.hs @@ -3,8 +3,8 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Lua.Util - Copyright : © 2012–2020 John MacFarlane, - © 2017-2020 Albert Krewinkel + Copyright : © 2012-2021 John MacFarlane, + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Lua/Walk.hs b/src/Text/Pandoc/Lua/Walk.hs index 695c7b44e..d6d973496 100644 --- a/src/Text/Pandoc/Lua/Walk.hs +++ b/src/Text/Pandoc/Lua/Walk.hs @@ -4,8 +4,8 @@ {-# LANGUAGE MultiParamTypeClasses #-} {- | Module : Text.Pandoc.Lua.Walk -Copyright : © 2012–2020 John MacFarlane, - © 2017-2020 Albert Krewinkel +Copyright : © 2012-2021 John MacFarlane, + © 2017-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> Stability : alpha diff --git a/src/Text/Pandoc/MIME.hs b/src/Text/Pandoc/MIME.hs index 4fe25ebe1..77c7069e9 100644 --- a/src/Text/Pandoc/MIME.hs +++ b/src/Text/Pandoc/MIME.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.MIME - Copyright : Copyright (C) 2011-2020 John MacFarlane + Copyright : Copyright (C) 2011-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -10,8 +10,13 @@ Mime type lookup. -} -module Text.Pandoc.MIME ( MimeType, getMimeType, getMimeTypeDef, - extensionFromMimeType, mediaCategory ) where +module Text.Pandoc.MIME ( + MimeType, + getMimeType, + getMimeTypeDef, + getCharset, + extensionFromMimeType, + mediaCategory ) where import Data.List (isPrefixOf, isSuffixOf) import qualified Data.Map as M import qualified Data.Text as T @@ -38,6 +43,16 @@ getMimeTypeDef :: FilePath -> MimeType getMimeTypeDef = fromMaybe "application/octet-stream" . getMimeType extensionFromMimeType :: MimeType -> Maybe T.Text +-- few special cases, where there are multiple options: +extensionFromMimeType "text/plain" = Just "txt" +extensionFromMimeType "video/quicktime" = Just "mov" +extensionFromMimeType "video/mpeg" = Just "mpeg" +extensionFromMimeType "video/dv" = Just "dv" +extensionFromMimeType "image/vnd.djvu" = Just "djvu" +extensionFromMimeType "image/tiff" = Just "tiff" +extensionFromMimeType "image/jpeg" = Just "jpg" +extensionFromMimeType "application/xml" = Just "xml" +extensionFromMimeType "application/ogg" = Just "ogg" extensionFromMimeType mimetype = M.lookup (T.takeWhile (/=';') mimetype) reverseMimeTypes -- note: we just look up the basic mime type, dropping the content-encoding etc. @@ -54,6 +69,14 @@ reverseMimeTypes = M.fromList $ map swap mimeTypesList mimeTypes :: M.Map T.Text MimeType mimeTypes = M.fromList mimeTypesList +-- | Get the charset from a mime type, if one is present. +getCharset :: MimeType -> Maybe T.Text +getCharset mt = + let (_,y) = T.breakOn "charset=" mt + in if T.null y + then Nothing + else Just $ T.toUpper $ T.takeWhile (/= ';') $ T.drop 8 y + -- | Collection of common mime types. 
-- Except for first entry, list borrowed from -- <https://github.com/Happstack/happstack-server/blob/master/src/Happstack/Server/FileServe/BuildingBlocks.hs happstack-server> diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs index 26f44cef0..098e484ee 100644 --- a/src/Text/Pandoc/MediaBag.hs +++ b/src/Text/Pandoc/MediaBag.hs @@ -1,9 +1,10 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {- | Module : Text.Pandoc.MediaBag - Copyright : Copyright (C) 2014-2015, 2017–2020 John MacFarlane + Copyright : Copyright (C) 2014-2015, 2017-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -14,6 +15,7 @@ Definition of a MediaBag object to hold binary resources, and an interface for interacting with it. -} module Text.Pandoc.MediaBag ( + MediaItem(..), MediaBag, deleteMedia, lookupMedia, @@ -24,29 +26,43 @@ module Text.Pandoc.MediaBag ( import qualified Data.ByteString.Lazy as BL import Data.Data (Data) import qualified Data.Map as M -import Data.Maybe (fromMaybe) +import Data.Maybe (fromMaybe, isNothing) import Data.Typeable (Typeable) import System.FilePath -import qualified System.FilePath.Posix as Posix -import Text.Pandoc.MIME (MimeType, getMimeTypeDef) +import Text.Pandoc.MIME (MimeType, getMimeTypeDef, extensionFromMimeType) +import Data.Text (Text) +import qualified Data.Text as T +import Data.Digest.Pure.SHA (sha1, showDigest) +import Network.URI (URI (..), parseURI) + +data MediaItem = + MediaItem + { mediaMimeType :: MimeType + , mediaPath :: FilePath + , mediaContents :: BL.ByteString + } deriving (Eq, Ord, Show, Data, Typeable) -- | A container for a collection of binary resources, with names and -- mime types. Note that a 'MediaBag' is a Monoid, so 'mempty' -- can be used for an empty 'MediaBag', and '<>' can be used to append -- two 'MediaBag's. -newtype MediaBag = MediaBag (M.Map [FilePath] (MimeType, BL.ByteString)) +newtype MediaBag = MediaBag (M.Map Text MediaItem) deriving (Semigroup, Monoid, Data, Typeable) instance Show MediaBag where show bag = "MediaBag " ++ show (mediaDirectory bag) +-- | We represent paths with /, in normalized form. +canonicalize :: FilePath -> Text +canonicalize = T.replace "\\" "/" . T.pack . normalise + -- | Delete a media item from a 'MediaBag', or do nothing if no item corresponds -- to the given path. deleteMedia :: FilePath -- ^ relative path and canonical name of resource -> MediaBag -> MediaBag deleteMedia fp (MediaBag mediamap) = - MediaBag $ M.delete (splitDirectories fp) mediamap + MediaBag $ M.delete (canonicalize fp) mediamap -- | Insert a media item into a 'MediaBag', replacing any existing -- value with the same name. @@ -56,26 +72,41 @@ insertMedia :: FilePath -- ^ relative path and canonical name of resource -> MediaBag -> MediaBag insertMedia fp mbMime contents (MediaBag mediamap) = - MediaBag (M.insert (splitDirectories fp) (mime, contents) mediamap) - where mime = fromMaybe fallback mbMime + MediaBag (M.insert fp' mediaItem mediamap) + where mediaItem = MediaItem{ mediaPath = newpath + , mediaContents = contents + , mediaMimeType = mt } + fp' = canonicalize fp + uri = parseURI fp + newpath = if isRelative fp + && isNothing uri + && ".." `notElem` splitPath fp + then T.unpack fp' + else showDigest (sha1 contents) <> "." 
<> ext fallback = case takeExtension fp of ".gz" -> getMimeTypeDef $ dropExtension fp _ -> getMimeTypeDef fp + mt = fromMaybe fallback mbMime + path = maybe fp uriPath uri + ext = case takeExtension path of + '.':e -> e + _ -> maybe "" T.unpack $ extensionFromMimeType mt + -- | Lookup a media item in a 'MediaBag', returning mime type and contents. lookupMedia :: FilePath -> MediaBag - -> Maybe (MimeType, BL.ByteString) -lookupMedia fp (MediaBag mediamap) = M.lookup (splitDirectories fp) mediamap + -> Maybe MediaItem +lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap -- | Get a list of the file paths stored in a 'MediaBag', with -- their corresponding mime types and the lengths in bytes of the contents. mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)] -mediaDirectory (MediaBag mediamap) = - M.foldrWithKey (\fp (mime,contents) -> - ((Posix.joinPath fp, mime, fromIntegral $ BL.length contents):)) [] mediamap +mediaDirectory mediabag = + map (\(fp, mt, bs) -> (fp, mt, fromIntegral (BL.length bs))) + (mediaItems mediabag) mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)] mediaItems (MediaBag mediamap) = - M.foldrWithKey (\fp (mime,contents) -> - ((Posix.joinPath fp, mime, contents):)) [] mediamap + map (\item -> (mediaPath item, mediaMimeType item, mediaContents item)) + (M.elems mediamap) diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index c7f1a56fa..85d9aa103 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -6,7 +6,7 @@ {-# LANGUAGE TemplateHaskell #-} {- | Module : Text.Pandoc.Options - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -65,6 +65,7 @@ data ReaderOptions = ReaderOptions{ , readerDefaultImageExtension :: Text -- ^ Default extension for images , readerTrackChanges :: TrackChanges -- ^ Track changes setting for docx , readerStripComments :: Bool -- ^ Strip HTML comments instead of parsing as raw HTML + -- (only implemented in commonmark) } deriving (Show, Read, Data, Typeable, Generic) instance HasSyntaxExtensions ReaderOptions where @@ -315,6 +316,23 @@ defaultKaTeXURL :: Text defaultKaTeXURL = "https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.11.1/" -- Update documentation in doc/filters.md if this is changed. +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''TrackChanges) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''WrapOption) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated . drop 8 + } ''TopLevelDivision) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''ReferenceLocation) + +-- Update documentation in doc/filters.md if this is changed. $(deriveJSON defaultOptions ''ReaderOptions) $(deriveJSON defaultOptions{ @@ -337,20 +355,3 @@ $(deriveJSON defaultOptions{ constructorTagModifier = } ''ObfuscationMethod) $(deriveJSON defaultOptions ''HTMLSlideVariant) - --- Update documentation in doc/filters.md if this is changed. -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''TrackChanges) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''WrapOption) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated . 
drop 8 - } ''TopLevelDivision) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''ReferenceLocation) diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs index c4080a227..c4e30af34 100644 --- a/src/Text/Pandoc/PDF.hs +++ b/src/Text/Pandoc/PDF.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.PDF - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -43,6 +43,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocPDFProgramNotFoundError)) import Text.Pandoc.MIME (getMimeType) import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..)) +import Text.Pandoc.Extensions (disableExtension, Extension(Ext_smart)) import Text.Pandoc.Process (pipeProcess) import System.Process (readProcessWithExitCode) import Text.Pandoc.Shared (inDirectory, stringify, tshow) @@ -114,7 +115,10 @@ makePDF program pdfargs writer opts doc = runIOorExplode $ do putCommonState commonState doc' <- handleImages opts tmpdir doc - source <- writer opts doc' + source <- writer opts{ writerExtensions = -- disable use of quote + -- ligatures to avoid bad ligatures like ?` + disableExtension Ext_smart + (writerExtensions opts) } doc' res <- case baseProg of "context" -> context2pdf verbosity program pdfargs tmpdir source "tectonic" -> tectonic2pdf verbosity program pdfargs tmpdir source @@ -198,7 +202,7 @@ convertImage opts tmpdir fname = do Just "image/svg+xml" -> E.catch (do (exit, _) <- pipeProcess Nothing "rsvg-convert" ["-f","pdf","-a","--dpi-x",dpi,"--dpi-y",dpi, - "-o",pdfOut,fname] BL.empty + "-o",pdfOut,svgIn] BL.empty if exit == ExitSuccess then return $ Right pdfOut else return $ Left "conversion from SVG failed") @@ -211,8 +215,9 @@ convertImage opts tmpdir fname = do E.catch (Right pngOut <$ JP.savePngImage pngOut img) $ \(e :: E.SomeException) -> return (Left (tshow e)) where - pngOut = replaceDirectory (replaceExtension fname ".png") tmpdir - pdfOut = replaceDirectory (replaceExtension fname ".pdf") tmpdir + pngOut = normalise $ replaceDirectory (replaceExtension fname ".png") tmpdir + pdfOut = normalise $ replaceDirectory (replaceExtension fname ".pdf") tmpdir + svgIn = normalise fname mime = getMimeType fname doNothing = return (Right fname) @@ -266,7 +271,7 @@ missingCharacterWarnings verbosity log' = do | isAscii c = T.singleton c | otherwise = T.pack $ c : " (U+" ++ printf "%04X" (ord c) ++ ")" let addCodePoint = T.concatMap toCodePoint - let warnings = [ addCodePoint (T.pack $ utf8ToString (BC.drop 19 l)) + let warnings = [ addCodePoint (utf8ToText (BC.drop 19 l)) | l <- ls , isMissingCharacterWarning l ] @@ -310,7 +315,7 @@ runTectonic verbosity program args' tmpDir' source = do env <- liftIO getEnvironment when (verbosity >= INFO) $ liftIO $ showVerboseInfo (Just tmpDir) program programArgs env - (utf8ToString sourceBL) + (utf8ToText sourceBL) (exit, out) <- liftIO $ E.catch (pipeProcess (Just env) program programArgs sourceBL) (handlePDFProgramNotFound program) @@ -381,7 +386,7 @@ runTeXProgram verbosity program args numRuns tmpDir' source = do (pipeProcess (Just env'') program programArgs BL.empty) (handlePDFProgramNotFound program) when (verbosity >= INFO) $ liftIO $ do - UTF8.hPutStrLn stderr $ "[makePDF] Run #" ++ show runNumber + UTF8.hPutStrLn stderr $ "[makePDF] Run #" <> tshow runNumber BL.hPutStr stderr out UTF8.hPutStr stderr "\n" if runNumber < 
numRuns @@ -401,7 +406,7 @@ generic2pdf :: Verbosity generic2pdf verbosity program args source = do env' <- getEnvironment when (verbosity >= INFO) $ - showVerboseInfo Nothing program args env' (T.unpack source) + showVerboseInfo Nothing program args env' source (exit, out) <- E.catch (pipeProcess (Just env') program args (BL.fromStrict $ UTF8.fromText source)) @@ -490,19 +495,32 @@ showVerboseInfo :: Maybe FilePath -> String -> [String] -> [(String, String)] - -> String + -> Text -> IO () showVerboseInfo mbTmpDir program programArgs env source = do case mbTmpDir of Just tmpDir -> do UTF8.hPutStrLn stderr "[makePDF] temp dir:" - UTF8.hPutStrLn stderr tmpDir + UTF8.hPutStrLn stderr (T.pack tmpDir) Nothing -> return () UTF8.hPutStrLn stderr "[makePDF] Command line:" - UTF8.hPutStrLn stderr $ program ++ " " ++ unwords (map show programArgs) + UTF8.hPutStrLn stderr $ + T.pack program <> " " <> T.pack (unwords (map show programArgs)) UTF8.hPutStr stderr "\n" - UTF8.hPutStrLn stderr "[makePDF] Environment:" - mapM_ (UTF8.hPutStrLn stderr . show) env + UTF8.hPutStrLn stderr "[makePDF] Relevant environment variables:" + -- we filter out irrelevant stuff to avoid leaking passwords and keys! + let isRelevant ("PATH",_) = True + isRelevant ("TMPDIR",_) = True + isRelevant ("PWD",_) = True + isRelevant ("LANG",_) = True + isRelevant ("HOME",_) = True + isRelevant ("LUA_PATH",_) = True + isRelevant ("LUA_CPATH",_) = True + isRelevant ("SHELL",_) = True + isRelevant ("TEXINPUTS",_) = True + isRelevant ("TEXMFOUTPUT",_) = True + isRelevant _ = False + mapM_ (UTF8.hPutStrLn stderr . tshow) (filter isRelevant env) UTF8.hPutStr stderr "\n" UTF8.hPutStrLn stderr "[makePDF] Source:" UTF8.hPutStrLn stderr source @@ -513,8 +531,8 @@ handlePDFProgramNotFound program e E.throwIO $ PandocPDFProgramNotFoundError $ T.pack program | otherwise = E.throwIO e -utf8ToString :: ByteString -> String -utf8ToString lbs = +utf8ToText :: ByteString -> Text +utf8ToText lbs = case decodeUtf8' lbs of - Left _ -> BC.unpack lbs -- if decoding fails, treat as latin1 - Right t -> TL.unpack t + Left _ -> T.pack $ BC.unpack lbs -- if decoding fails, treat as latin1 + Right t -> TL.toStrict t diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 4bae8942b..09445622d 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -5,11 +5,10 @@ {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE IncoherentInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -19,8 +18,7 @@ A utility library with parsers used in pandoc readers. 
-} -module Text.Pandoc.Parsing ( take1WhileP, - takeP, +module Text.Pandoc.Parsing ( module Text.Pandoc.Sources, countChar, textStr, anyLine, @@ -105,14 +103,14 @@ module Text.Pandoc.Parsing ( take1WhileP, singleQuoteEnd, doubleQuoteStart, doubleQuoteEnd, - ellipses, apostrophe, + doubleCloseQuote, + ellipses, dash, nested, citeKey, Parser, ParserT, - F, Future(..), runF, askF, @@ -123,7 +121,6 @@ module Text.Pandoc.Parsing ( take1WhileP, (<+?>), extractIdClass, insertIncludedFile, - insertIncludedFileF, -- * Re-exports from Text.Parsec Stream, runParser, @@ -134,22 +131,10 @@ module Text.Pandoc.Parsing ( take1WhileP, getInput, setInput, unexpected, - char, - letter, - digit, - alphaNum, skipMany, skipMany1, - spaces, - space, - anyChar, - satisfy, - newline, - string, count, eof, - noneOf, - oneOf, lookAhead, notFollowedBy, many, @@ -174,6 +159,8 @@ module Text.Pandoc.Parsing ( take1WhileP, SourcePos, getPosition, setPosition, + sourceName, + setSourceName, sourceColumn, sourceLine, setSourceColumn, @@ -189,48 +176,141 @@ module Text.Pandoc.Parsing ( take1WhileP, where import Control.Monad.Identity + ( guard, + join, + unless, + when, + void, + liftM2, + liftM, + Identity(..), + MonadPlus(mzero) ) import Control.Monad.Reader -import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, - isPunctuation, isSpace, ord, toLower, toUpper) -import Data.Default + ( asks, runReader, MonadReader(ask), Reader, ReaderT(ReaderT) ) +import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, isAsciiLower, + isSpace, ord, toLower, toUpper) +import Data.Default ( Default(..) ) import Data.Functor (($>)) import Data.List (intercalate, transpose) import qualified Data.Map as M -import Data.Maybe (mapMaybe, fromMaybe) +import Data.Maybe (fromMaybe) import qualified Data.Set as Set -import Data.String import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup.Entity (lookupEntity) -import Text.Pandoc.Asciify (toAsciiChar) +import Text.Pandoc.Asciify (toAsciiText) import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report) import Text.Pandoc.Definition + ( Target, + nullMeta, + nullAttr, + Meta, + ColWidth(ColWidthDefault, ColWidth), + TableFoot(TableFoot), + TableBody(TableBody), + Attr, + TableHead(TableHead), + Row(..), + Alignment(..), + Inline(Str), + ListNumberDelim(..), + ListAttributes, + ListNumberStyle(..) 
) import Text.Pandoc.Logging + ( LogMessage(CouldNotLoadIncludeFile, DuplicateIdentifier) ) import Text.Pandoc.Options + ( extensionEnabled, + Extension(Ext_old_dashes, Ext_tex_math_dollars, + Ext_tex_math_single_backslash, Ext_tex_math_double_backslash, + Ext_auto_identifiers, Ext_ascii_identifiers, Ext_smart), + ReaderOptions(readerTabStop, readerColumns, readerExtensions) ) import Text.Pandoc.Readers.LaTeX.Types (Macro) import Text.Pandoc.Shared + ( uniqueIdent, + tshow, + mapLeft, + compactify, + trim, + trimr, + splitTextByIndices, + safeRead, + trimMath, + schemes, + escapeURI ) +import Text.Pandoc.Sources import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.Pandoc.XML (fromEntities) -import Text.Parsec hiding (token) -import Text.Parsec.Pos (initialPos, newPos, updatePosString) - -import Control.Monad.Except +import Text.Parsec + ( between, + setSourceName, + Parsec, + Column, + Line, + incSourceLine, + incSourceColumn, + setSourceLine, + setSourceColumn, + sourceLine, + sourceColumn, + sourceName, + setSourceName, + setPosition, + getPosition, + updateState, + setState, + getState, + optionMaybe, + optional, + option, + endBy1, + endBy, + sepEndBy1, + sepEndBy, + sepBy1, + sepBy, + try, + choice, + (<?>), + (<|>), + manyTill, + many1, + many, + notFollowedBy, + lookAhead, + eof, + count, + skipMany1, + skipMany, + unexpected, + setInput, + getInput, + anyToken, + tokenPrim, + parse, + runParserT, + runParser, + ParseError, + ParsecT, + SourcePos, + Stream(..) ) +import Text.Parsec.Pos (initialPos, newPos) +import Control.Monad.Except ( MonadError(throwError) ) import Text.Pandoc.Error + ( PandocError(PandocParseError, PandocParsecError) ) type Parser t s = Parsec t s type ParserT = ParsecT + -- | Reader monad wrapping the parser state. This is used to possibly delay -- evaluation until all relevant information has been parsed and made available -- in the parser state. newtype Future s a = Future { runDelayed :: Reader s a } deriving (Monad, Applicative, Functor) -type F = Future ParserState - runF :: Future s a -> s -> a runF = runReader . runDelayed @@ -253,70 +333,48 @@ instance (Semigroup a, Monoid a) => Monoid (Future s a) where mappend = (<>) -- | Like @count@, but packs its result -countChar :: (Stream s m Char, Monad m) +countChar :: (Stream s m Char, UpdateSourcePos s Char, Monad m) => Int -> ParsecT s st m Char -> ParsecT s st m Text countChar n = fmap T.pack . count n -- | Like @string@, but uses @Text@. -textStr :: Stream s m Char => Text -> ParsecT s u m Text +textStr :: (Stream s m Char, UpdateSourcePos s Char) + => Text -> ParsecT s u m Text textStr t = string (T.unpack t) $> t --- | Parse characters while a predicate is true. -take1WhileP :: Monad m - => (Char -> Bool) - -> ParserT Text st m Text -take1WhileP f = do - -- needed to persuade parsec that this won't match an empty string: - c <- satisfy f - inp <- getInput - pos <- getPosition - let (t, rest) = T.span f inp - setInput rest - setPosition $ - if f '\t' || f '\n' - then updatePosString pos $ T.unpack t - else incSourceColumn pos (T.length t) - return $ T.singleton c <> t - --- Parse n characters of input (or the rest of the input if --- there aren't n characters). 
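The Parsing hunks in this region drop the fixed `F = Future ParserState` alias and keep only the state-polymorphic `Future s` wrapper. A minimal sketch, not part of the patch, of how a reader might now declare its own alias and force a delayed value with `runF`; `MyState`, `byline`, and `resolved` are invented names:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import Text.Pandoc.Builder (Blocks, plain, text)
import Text.Pandoc.Parsing (Future, askF, runF)

-- Hypothetical reader state, standing in for a reader's real parser state.
newtype MyState = MyState { stAuthor :: Text }

-- Reader-local replacement for the removed 'type F = Future ParserState'.
type F = Future MyState

-- A delayed computation: the byline can only be rendered once the final
-- parser state (and hence the author metadata) is known.
byline :: F Blocks
byline = (\st -> plain (text ("by " <> stAuthor st))) <$> askF

-- Resolve the delayed value against the finished state with 'runF'.
resolved :: Blocks
resolved = runF byline (MyState "Ada Lovelace")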
-takeP :: Monad m => Int -> ParserT Text st m Text -takeP n = do - guard (n > 0) - -- faster than 'count n anyChar' - inp <- getInput - pos <- getPosition - let (xs, rest) = T.splitAt n inp - -- needed to persuade parsec that this won't match an empty string: - anyChar - setInput rest - setPosition $ updatePosString pos $ T.unpack xs - return xs - --- | Parse any line of text -anyLine :: Monad m => ParserT Text st m Text + +-- | Parse any line of text, returning the contents without the +-- final newline. +anyLine :: Monad m => ParserT Sources st m Text anyLine = do -- This is much faster than: -- manyTill anyChar newline inp <- getInput - pos <- getPosition - case T.break (=='\n') inp of - (this, T.uncons -> Just ('\n', rest)) -> do - -- needed to persuade parsec that this won't match an empty string: - anyChar - setInput rest - setPosition $ incSourceLine (setSourceColumn pos 1) 1 - return this - _ -> mzero + case inp of + Sources [] -> mzero + Sources ((fp,t):inps) -> + -- we assume that lines don't span different input files + case T.break (=='\n') t of + (this, rest) + | T.null rest + , not (null inps) -> + -- line may span different input files, so do it + -- character by character + T.pack <$> manyTill anyChar newline + | otherwise -> do -- either end of inputs or newline in rest + setInput $ Sources ((fp, rest):inps) + char '\n' -- needed so parsec knows we won't match empty string + -- and so source pos is updated + return this -- | Parse any line, include the final newline in the output -anyLineNewline :: Monad m => ParserT Text st m Text +anyLineNewline :: Monad m => ParserT Sources st m Text anyLineNewline = (<> "\n") <$> anyLine -- | Parse indent by specified number of spaces (or equiv. tabs) -indentWith :: Stream s m Char +indentWith :: (Stream s m Char, UpdateSourcePos s Char) => HasReaderOptions st => Int -> ParserT s st m Text indentWith num = do @@ -401,11 +459,13 @@ notFollowedBy' p = try $ join $ do a <- try p return (return ()) -- (This version due to Andrew Pimlott on the Haskell mailing list.) -oneOfStrings' :: Stream s m Char => (Char -> Char -> Bool) -> [Text] -> ParserT s st m Text +oneOfStrings' :: (Stream s m Char, UpdateSourcePos s Char) + => (Char -> Char -> Bool) -> [Text] -> ParserT s st m Text oneOfStrings' f = fmap T.pack . oneOfStrings'' f . fmap T.unpack -- TODO: This should be re-implemented in a Text-aware way -oneOfStrings'' :: Stream s m Char => (Char -> Char -> Bool) -> [String] -> ParserT s st m String +oneOfStrings'' :: (Stream s m Char, UpdateSourcePos s Char) + => (Char -> Char -> Bool) -> [String] -> ParserT s st m String oneOfStrings'' _ [] = Prelude.fail "no strings" oneOfStrings'' matches strs = try $ do c <- anyChar @@ -420,14 +480,16 @@ oneOfStrings'' matches strs = try $ do -- | Parses one of a list of strings. If the list contains -- two strings one of which is a prefix of the other, the longer -- string will be matched if possible. -oneOfStrings :: Stream s m Char => [Text] -> ParserT s st m Text +oneOfStrings :: (Stream s m Char, UpdateSourcePos s Char) + => [Text] -> ParserT s st m Text oneOfStrings = oneOfStrings' (==) -- | Parses one of a list of strings (tried in order), case insensitive. 
-- TODO: This will not be accurate with general Unicode (neither -- Text.toLower nor Text.toCaseFold can be implemented with a map) -oneOfStringsCI :: Stream s m Char => [Text] -> ParserT s st m Text +oneOfStringsCI :: (Stream s m Char, UpdateSourcePos s Char) + => [Text] -> ParserT s st m Text oneOfStringsCI = oneOfStrings' ciMatch where ciMatch x y = toLower' x == toLower' y -- this optimizes toLower by checking common ASCII case @@ -438,29 +500,41 @@ oneOfStringsCI = oneOfStrings' ciMatch | otherwise = toLower c -- | Parses a space or tab. -spaceChar :: Stream s m Char => ParserT s st m Char +spaceChar :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. -nonspaceChar :: Stream s m Char => ParserT s st m Char -nonspaceChar = noneOf ['\t', '\n', ' ', '\r'] +nonspaceChar :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char +nonspaceChar = satisfy (not . isSpaceChar) + +isSpaceChar :: Char -> Bool +isSpaceChar ' ' = True +isSpaceChar '\t' = True +isSpaceChar '\n' = True +isSpaceChar '\r' = True +isSpaceChar _ = False -- | Skips zero or more spaces or tabs. -skipSpaces :: Stream s m Char => ParserT s st m () +skipSpaces :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m () skipSpaces = skipMany spaceChar -- | Skips zero or more spaces or tabs, then reads a newline. -blankline :: Stream s m Char => ParserT s st m Char +blankline :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Char blankline = try $ skipSpaces >> newline -- | Parses one or more blank lines and returns a string of newlines. -blanklines :: Stream s m Char => ParserT s st m Text +blanklines :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text blanklines = T.pack <$> many1 blankline -- | Gobble n spaces; if tabs are encountered, expand them -- and gobble some or all of their spaces, leaving the rest. gobbleSpaces :: (HasReaderOptions st, Monad m) - => Int -> ParserT Text st m () + => Int -> ParserT Sources st m () gobbleSpaces 0 = return () gobbleSpaces n | n < 0 = error "gobbleSpaces called with negative number" @@ -468,18 +542,26 @@ gobbleSpaces n char ' ' <|> eatOneSpaceOfTab gobbleSpaces (n - 1) -eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT Text st m Char +eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT Sources st m Char eatOneSpaceOfTab = do - char '\t' + lookAhead (char '\t') + pos <- getPosition tabstop <- getOption readerTabStop + -- replace the tab on the input stream with spaces + let numSpaces = tabstop - ((sourceColumn pos - 1) `mod` tabstop) inp <- getInput - setInput $ T.replicate (tabstop - 1) " " <> inp - return ' ' + setInput $ + case inp of + Sources [] -> error "eatOneSpaceOfTab - empty Sources list" + Sources ((fp,t):rest) -> + -- drop the tab and add spaces + Sources ((fp, T.replicate numSpaces " " <> T.drop 1 t):rest) + char ' ' -- | Gobble up to n spaces; if tabs are encountered, expand them -- and gobble some or all of their spaces, leaving the rest. gobbleAtMostSpaces :: (HasReaderOptions st, Monad m) - => Int -> ParserT Text st m Int + => Int -> ParserT Sources st m Int gobbleAtMostSpaces 0 = return 0 gobbleAtMostSpaces n | n < 0 = error "gobbleAtMostSpaces called with negative number" @@ -488,7 +570,8 @@ gobbleAtMostSpaces n (+ 1) <$> gobbleAtMostSpaces (n - 1) -- | Parses material enclosed between start and end parsers. 
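A recurring change in the hunks above and below: every character-level combinator gains an `UpdateSourcePos s Char` constraint next to `Stream s m Char`, because primitives like `satisfy` and `char` now come from Text.Pandoc.Sources (re-exported by this module). A small sketch of the shape a user-defined combinator takes after this change; `asciiLetter` is an invented example, not part of the patch:

import Data.Char (isAsciiLower, isAsciiUpper)
import Text.Pandoc.Parsing  -- re-exports Text.Pandoc.Sources, incl. satisfy

-- Mirrors spaceChar/nonspaceChar above: both constraints are needed so
-- the combinator works over plain Text streams and over Sources.
asciiLetter :: (Stream s m Char, UpdateSourcePos s Char)
            => ParserT s st m Char
asciiLetter = satisfy (\c -> isAsciiLower c || isAsciiUpper c)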
-enclosed :: (Show end, Stream s m Char) => ParserT s st m t -- ^ start parser +enclosed :: (Show end, Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m t -- ^ start parser -> ParserT s st m end -- ^ end parser -> ParserT s st m a -- ^ content parser (to be used repeatedly) -> ParserT s st m [a] @@ -496,39 +579,41 @@ enclosed start end parser = try $ start >> notFollowedBy space >> many1Till parser end -- | Parse string, case insensitive. -stringAnyCase :: Stream s m Char => Text -> ParserT s st m Text +stringAnyCase :: (Stream s m Char, UpdateSourcePos s Char) + => Text -> ParserT s st m Text stringAnyCase = fmap T.pack . stringAnyCase' . T.unpack -stringAnyCase' :: Stream s m Char => String -> ParserT s st m String +stringAnyCase' :: (Stream s m Char, UpdateSourcePos s Char) + => String -> ParserT s st m String stringAnyCase' [] = string "" stringAnyCase' (x:xs) = do firstChar <- char (toUpper x) <|> char (toLower x) rest <- stringAnyCase' xs return (firstChar:rest) +-- TODO rewrite by just adding to Sources stream? -- | Parse contents of 'str' using 'parser' and return result. -parseFromString :: (Stream s m Char, IsString s) - => ParserT s st m r +parseFromString :: Monad m + => ParserT Sources st m r -> Text - -> ParserT s st m r + -> ParserT Sources st m r parseFromString parser str = do oldPos <- getPosition setPosition $ initialPos "chunk" oldInput <- getInput - setInput $ fromString $ T.unpack str + setInput $ toSources str result <- parser spaces - eof setInput oldInput setPosition oldPos return result -- | Like 'parseFromString' but specialized for 'ParserState'. -- This resets 'stateLastStrPos', which is almost always what we want. -parseFromString' :: (Stream s m Char, IsString s, HasLastStrPosition u) - => ParserT s u m a +parseFromString' :: (Monad m, HasLastStrPosition u) + => ParserT Sources u m a -> Text - -> ParserT s u m a + -> ParserT Sources u m a parseFromString' parser str = do oldLastStrPos <- getLastStrPos <$> getState updateState $ setLastStrPos Nothing @@ -537,7 +622,7 @@ parseFromString' parser str = do return res -- | Parse raw line block up to and including blank lines. -lineClump :: Monad m => ParserT Text st m Text +lineClump :: Monad m => ParserT Sources st m Text lineClump = blanklines <|> (T.unlines <$> many1 (notFollowedBy blankline >> anyLine)) @@ -546,7 +631,7 @@ lineClump = blanklines -- pairs of open and close, which must be different. For example, -- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" -- and return "hello (there)". -charsInBalanced :: Stream s m Char => Char -> Char -> ParserT s st m Char +charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParserT s st m Char -> ParserT s st m Text charsInBalanced open close parser = try $ do char open @@ -565,7 +650,7 @@ charsInBalanced open close parser = try $ do -- Auxiliary functions for romanNumeral: -- | Parses a roman numeral (uppercase or lowercase), returns number. -romanNumeral :: Stream s m Char => Bool -- ^ Uppercase if true +romanNumeral :: (Stream s m Char, UpdateSourcePos s Char) => Bool -- ^ Uppercase if true -> ParserT s st m Int romanNumeral upperCase = do let rchar uc = char $ if upperCase then uc else toLower uc @@ -601,20 +686,19 @@ romanNumeral upperCase = do -- | Parses an email address; returns original and corresponding -- escaped mailto: URI. 
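`parseFromString'` above now converts its chunk to `Sources` via `toSources` and no longer insists on `eof`, so trailing input in the chunk is tolerated. A rough sketch of the usual calling pattern, re-parsing an already-extracted piece of text mid-parse; `cellInlines` is an invented helper and the inline parser used here is deliberately trivial:

import Data.Text (Text)
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Parsing

-- Re-parse a chunk (say, the raw contents of a table cell) as inlines
-- from inside a larger Sources-based parse.
cellInlines :: Monad m => Text -> ParserT Sources ParserState m B.Inlines
cellInlines raw =
  parseFromString' (B.text . T.pack <$> many1 anyChar) raw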
-emailAddress :: Stream s m Char => ParserT s st m (Text, Text) +emailAddress :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) where toResult mbox dom = let full = fromEntities $ T.pack $ mbox ++ '@':dom in (full, escapeURI $ "mailto:" <> full) mailbox = intercalate "." <$> (emailWord `sepBy1'` dot) domain = intercalate "." <$> (subdomain `sepBy1'` dot) dot = char '.' - subdomain = many1 $ alphaNum <|> innerPunct + subdomain = many1 $ alphaNum <|> innerPunct (=='-') -- this excludes some valid email addresses, since an -- email could contain e.g. '__', but gives better results -- for our purposes, when combined with markdown parsing: - innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') - <* notFollowedBy space - <* notFollowedBy (satisfy isPunctuation)) + innerPunct f = try (satisfy f + <* notFollowedBy (satisfy (not . isAlphaNum))) -- technically an email address could begin with a symbol, -- but allowing this creates too many problems. -- See e.g. https://github.com/jgm/pandoc/issues/2940 @@ -625,16 +709,16 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) isEmailPunct c = T.any (== c) "!\"#$%&'*+-/=?^_{|}~;" -uriScheme :: Stream s m Char => ParserT s st m Text +uriScheme :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text uriScheme = oneOfStringsCI (Set.toList schemes) -- | Parses a URI. Returns pair of original and URI-escaped version. -uri :: Stream s m Char => ParserT s st m (Text, Text) +uri :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text) uri = try $ do scheme <- uriScheme char ':' -- Avoid parsing e.g. "**Notes:**" as a raw URI: - notFollowedBy (oneOf "*_]") + notFollowedBy $ satisfy (\c -> c == '*' || c == '_' || c == ']') -- We allow sentence punctuation except at the end, since -- we don't want the trailing '.' in 'http://google.com.' We want to allow -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) @@ -648,7 +732,20 @@ uri = try $ do let uri' = scheme <> ":" <> fromEntities str' return (uri', escapeURI uri') where - wordChar = alphaNum <|> oneOf "#$%+/@\\_-&=" + isWordChar '#' = True + isWordChar '$' = True + isWordChar '%' = True + isWordChar '+' = True + isWordChar '/' = True + isWordChar '@' = True + isWordChar '\\' = True + isWordChar '_' = True + isWordChar '-' = True + isWordChar '&' = True + isWordChar '=' = True + isWordChar c = isAlphaNum c + + wordChar = satisfy isWordChar percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit entity = try $ pure <$> characterReference punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) @@ -659,11 +756,13 @@ uri = try $ do uriChunkBetween l r = try $ do chunk <- between (char l) (char r) uriChunk return (T.pack $ [l] ++ chunk ++ [r]) -mathInlineWith :: Stream s m Char => Text -> Text -> ParserT s st m Text +mathInlineWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text mathInlineWith op cl = try $ do textStr op when (op == "$") $ notFollowedBy space - words' <- many1Till (countChar 1 (noneOf " \t\n\\") + words' <- many1Till ( + (T.singleton <$> + satisfy (\c -> not (isSpaceChar c || c == '\\'))) <|> (char '\\' >> -- This next clause is needed because \text{..} can -- contain $, \(\), etc. 
@@ -671,17 +770,17 @@ mathInlineWith op cl = try $ do (("\\text" <>) <$> inBalancedBraces 0 "")) <|> (\c -> T.pack ['\\',c]) <$> anyChar)) <|> do (blankline <* notFollowedBy' blankline) <|> - (oneOf " \t" <* skipMany (oneOf " \t")) + (spaceChar <* skipMany spaceChar) notFollowedBy (char '$') return " " ) (try $ textStr cl) notFollowedBy digit -- to prevent capture of $5 return $ trimMath $ T.concat words' where - inBalancedBraces :: Stream s m Char => Int -> Text -> ParserT s st m Text + inBalancedBraces :: (Stream s m Char, UpdateSourcePos s Char) => Int -> Text -> ParserT s st m Text inBalancedBraces n = fmap T.pack . inBalancedBraces' n . T.unpack - inBalancedBraces' :: Stream s m Char => Int -> String -> ParserT s st m String + inBalancedBraces' :: (Stream s m Char, UpdateSourcePos s Char) => Int -> String -> ParserT s st m String inBalancedBraces' 0 "" = do c <- anyChar if c == '{' @@ -698,12 +797,13 @@ mathInlineWith op cl = try $ do '{' -> inBalancedBraces' (numOpen + 1) (c:xs) _ -> inBalancedBraces' numOpen (c:xs) -mathDisplayWith :: Stream s m Char => Text -> Text -> ParserT s st m Text +mathDisplayWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text mathDisplayWith op cl = try $ fmap T.pack $ do textStr op - many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ textStr cl) + many1Till (satisfy (/= '\n') <|> (newline <* notFollowedBy' blankline)) + (try $ textStr cl) -mathDisplay :: (HasReaderOptions st, Stream s m Char) +mathDisplay :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text mathDisplay = (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") @@ -712,7 +812,7 @@ mathDisplay = <|> (guardEnabled Ext_tex_math_double_backslash >> mathDisplayWith "\\\\[" "\\\\]") -mathInline :: (HasReaderOptions st , Stream s m Char) +mathInline :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text mathInline = (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") @@ -725,7 +825,7 @@ mathInline = -- displacement (the difference between the source column at the end -- and the source column at the beginning). Vertical displacement -- (source row) is ignored. -withHorizDisplacement :: Stream s m Char +withHorizDisplacement :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m a -- ^ Parser to apply -> ParserT s st m (a, Int) -- ^ (result, displacement) withHorizDisplacement parser = do @@ -737,30 +837,37 @@ withHorizDisplacement parser = do -- | Applies a parser and returns the raw string that was parsed, -- along with the value produced by the parser. 
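The `withRaw` rewrite that follows recovers the consumed text by diffing the `Sources` before and after the wrapped parser (`sourcesDifference`) instead of slicing lines by source position, so it keeps working when input spans several chunks. A small usage sketch; `autolink` is an invented wrapper around the `uri` parser above:

{-# LANGUAGE OverloadedStrings #-}
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Parsing

-- Capture both the parsed result of 'uri' and the exact raw text it
-- consumed, then build a link whose label is the raw source text.
autolink :: Monad m => ParserT Sources st m B.Inlines
autolink = do
  ((_orig, escapedUri), raw) <- withRaw uri
  return $ B.link escapedUri "" (B.str raw)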
withRaw :: Monad m - => ParsecT Text st m a - -> ParsecT Text st m (a, Text) + => ParsecT Sources st m a + -> ParsecT Sources st m (a, Text) withRaw parser = do - pos1 <- getPosition - inp <- getInput + inps1 <- getInput result <- parser - pos2 <- getPosition - let (l1,c1) = (sourceLine pos1, sourceColumn pos1) - let (l2,c2) = (sourceLine pos2, sourceColumn pos2) - let inplines = take ((l2 - l1) + 1) $ T.lines inp - let raw = case inplines of - [] -> "" - [l] -> T.take (c2 - c1) l - ls -> T.unlines (init ls) <> T.take (c2 - 1) (last ls) - return (result, raw) + inps2 <- getInput + -- 'raw' is the difference between inps1 and inps2 + return (result, sourcesDifference inps1 inps2) + +sourcesDifference :: Sources -> Sources -> Text +sourcesDifference (Sources is1) (Sources is2) = go is1 is2 + where + go inps1 inps2 = + case (inps1, inps2) of + ([], _) -> mempty + (_, []) -> mconcat $ map snd inps1 + ((p1,t1):rest1, (p2, t2):rest2) + | p1 == p2 + , t1 == t2 -> go rest1 rest2 + | p1 == p2 + , t1 /= t2 -> fromMaybe mempty $ T.stripSuffix t2 t1 + | otherwise -> t1 <> go rest1 inps2 -- | Parses backslash, then applies character parser. -escaped :: Stream s m Char +escaped :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char -- ^ Parser for character to escape -> ParserT s st m Char escaped parser = try $ char '\\' >> parser -- | Parse character entity. -characterReference :: Stream s m Char => ParserT s st m Char +characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char characterReference = try $ do char '&' ent <- many1Till nonspaceChar (char ';') @@ -773,19 +880,19 @@ characterReference = try $ do _ -> Prelude.fail "entity not found" -- | Parses an uppercase roman numeral and returns (UpperRoman, number). -upperRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) upperRoman = do num <- romanNumeral True return (UpperRoman, num) -- | Parses a lowercase roman numeral and returns (LowerRoman, number). -lowerRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) lowerRoman = do num <- romanNumeral False return (LowerRoman, num) -- | Parses a decimal numeral and returns (Decimal, number). -decimal :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +decimal :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) decimal = do num <- many1 digit return (Decimal, fromMaybe 1 $ safeRead $ T.pack num) @@ -794,7 +901,7 @@ decimal = do -- returns (DefaultStyle, [next example number]). The next -- example number is incremented in parser state, and the label -- (if present) is added to the label table. -exampleNum :: Stream s m Char +exampleNum :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s ParserState m (ListNumberStyle, Int) exampleNum = do char '@' @@ -813,37 +920,37 @@ exampleNum = do return (Example, num) -- | Parses a '#' returns (DefaultStyle, 1). -defaultNum :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +defaultNum :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) defaultNum = do char '#' return (DefaultStyle, 1) -- | Parses a lowercase letter and returns (LowerAlpha, number). 
-lowerAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) lowerAlpha = do - ch <- oneOf ['a'..'z'] + ch <- satisfy isAsciiLower return (LowerAlpha, ord ch - ord 'a' + 1) -- | Parses an uppercase letter and returns (UpperAlpha, number). -upperAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) upperAlpha = do - ch <- oneOf ['A'..'Z'] + ch <- satisfy isAsciiUpper return (UpperAlpha, ord ch - ord 'A' + 1) -- | Parses a roman numeral i or I -romanOne :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +romanOne :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) romanOne = (char 'i' >> return (LowerRoman, 1)) <|> (char 'I' >> return (UpperRoman, 1)) -- | Parses an ordered list marker and returns list attributes. -anyOrderedListMarker :: Stream s m Char => ParserT s ParserState m ListAttributes +anyOrderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s ParserState m ListAttributes anyOrderedListMarker = choice [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], numParser <- [decimal, exampleNum, defaultNum, romanOne, lowerAlpha, lowerRoman, upperAlpha, upperRoman]] -- | Parses a list number (num) followed by a period, returns list attributes. -inPeriod :: Stream s m Char +inPeriod :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -> ParserT s st m ListAttributes inPeriod num = try $ do @@ -855,7 +962,7 @@ inPeriod num = try $ do return (start, style, delim) -- | Parses a list number (num) followed by a paren, returns list attributes. -inOneParen :: Stream s m Char +inOneParen :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -> ParserT s st m ListAttributes inOneParen num = try $ do @@ -864,7 +971,7 @@ inOneParen num = try $ do return (start, style, OneParen) -- | Parses a list number (num) enclosed in parens, returns list attributes. -inTwoParens :: Stream s m Char +inTwoParens :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int) -> ParserT s st m ListAttributes inTwoParens num = try $ do @@ -875,7 +982,7 @@ inTwoParens num = try $ do -- | Parses an ordered list marker with a given style and delimiter, -- returns number. -orderedListMarker :: Stream s m Char +orderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) => ListNumberStyle -> ListNumberDelim -> ParserT s ParserState m Int @@ -898,10 +1005,10 @@ orderedListMarker style delim = do return start -- | Parses a character reference and returns a Str element. -charRef :: Stream s m Char => ParserT s st m Inline +charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inline charRef = Str . T.singleton <$> characterReference -lineBlockLine :: Monad m => ParserT Text st m Text +lineBlockLine :: Monad m => ParserT Sources st m Text lineBlockLine = try $ do char '|' char ' ' @@ -911,11 +1018,11 @@ lineBlockLine = try $ do continuations <- many (try $ char ' ' >> anyLine) return $ white <> T.unwords (line : continuations) -blankLineBlockLine :: Stream s m Char => ParserT s st m Char +blankLineBlockLine :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char blankLineBlockLine = try (char '|' >> blankline) -- | Parses an RST-style line block and returns a list of strings. 
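The ordered-list marker parsers above are behaviourally unchanged; only their constraints widen. A quick sketch of exercising one of them through the `Sources`-based `readWith` defined later in this module; this assumes `defaultParserState` is exported (it is used by `testStringWith` below), and the expected result is my reading of the combinators, not something stated in the patch:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import Text.Pandoc.Definition (ListAttributes)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing

-- "(iv)" should parse as a lower-roman marker starting at 4 with
-- TwoParens delimiters; the trailing text is simply left unconsumed.
markerOfIv :: Either PandocError ListAttributes
markerOfIv =
  readWith anyOrderedListMarker defaultParserState
           ("(iv) fourth item" :: Text)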
-lineBlockLines :: Monad m => ParserT Text st m [Text] +lineBlockLines :: Monad m => ParserT Sources st m [Text] lineBlockLines = try $ do lines' <- many1 (lineBlockLine <|> (T.singleton <$> blankLineBlockLine)) skipMany blankline @@ -923,7 +1030,8 @@ lineBlockLines = try $ do -- | Parse a table using 'headerParser', 'rowParser', -- 'lineParser', and 'footerParser'. -tableWith :: (Stream s m Char, HasReaderOptions st, Monad mf) +tableWith :: (Stream s m Char, UpdateSourcePos s Char, + HasReaderOptions st, Monad mf) => ParserT s st m (mf [Blocks], [Alignment], [Int]) -> ([Int] -> ParserT s st m (mf [Blocks])) -> ParserT s st m sep @@ -943,7 +1051,8 @@ tableWith headerParser rowParser lineParser footerParser = try $ do type TableComponents mf = ([Alignment], [Double], mf [Row], mf [Row]) -tableWith' :: (Stream s m Char, HasReaderOptions st, Monad mf) +tableWith' :: (Stream s m Char, UpdateSourcePos s Char, + HasReaderOptions st, Monad mf) => ParserT s st m (mf [Blocks], [Alignment], [Int]) -> ([Int] -> ParserT s st m (mf [Blocks])) -> ParserT s st m sep @@ -992,20 +1101,19 @@ widthsFromIndices numColumns' indices = -- (which may be grid), then the rows, -- which may be grid, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). -gridTableWith :: (Stream s m Char, HasReaderOptions st, HasLastStrPosition st, - Monad mf, IsString s) - => ParserT s st m (mf Blocks) -- ^ Block list parser +gridTableWith :: (Monad m, HasReaderOptions st, HasLastStrPosition st, Monad mf) + => ParserT Sources st m (mf Blocks) -- ^ Block list parser -> Bool -- ^ Headerless table - -> ParserT s st m (mf Blocks) + -> ParserT Sources st m (mf Blocks) gridTableWith blocks headless = tableWith (gridTableHeader headless blocks) (gridTableRow blocks) (gridTableSep '-') gridTableFooter -gridTableWith' :: (Stream s m Char, HasReaderOptions st, HasLastStrPosition st, - Monad mf, IsString s) - => ParserT s st m (mf Blocks) -- ^ Block list parser +gridTableWith' :: (Monad m, HasReaderOptions st, HasLastStrPosition st, + Monad mf) + => ParserT Sources st m (mf Blocks) -- ^ Block list parser -> Bool -- ^ Headerless table - -> ParserT s st m (TableComponents mf) + -> ParserT Sources st m (TableComponents mf) gridTableWith' blocks headless = tableWith' (gridTableHeader headless blocks) (gridTableRow blocks) (gridTableSep '-') gridTableFooter @@ -1014,7 +1122,7 @@ gridTableSplitLine :: [Int] -> Text -> [Text] gridTableSplitLine indices line = map removeFinalBar $ tail $ splitTextByIndices (init indices) $ trimr line -gridPart :: Stream s m Char => Char -> ParserT s st m ((Int, Int), Alignment) +gridPart :: Monad m => Char -> ParserT Sources st m ((Int, Int), Alignment) gridPart ch = do leftColon <- option False (True <$ char ':') dashes <- many1 (char ch) @@ -1029,7 +1137,7 @@ gridPart ch = do (False, False) -> AlignDefault return ((lengthDashes, lengthDashes + 1), alignment) -gridDashedLines :: Stream s m Char => Char -> ParserT s st m [((Int, Int), Alignment)] +gridDashedLines :: Monad m => Char -> ParserT Sources st m [((Int, Int), Alignment)] gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline removeFinalBar :: Text -> Text @@ -1038,47 +1146,47 @@ removeFinalBar = T.dropWhileEnd go . T.dropWhileEnd (=='|') go c = T.any (== c) " \t" -- | Separator between rows of grid table. 
-gridTableSep :: Stream s m Char => Char -> ParserT s st m Char +gridTableSep :: Monad m => Char -> ParserT Sources st m Char gridTableSep ch = try $ gridDashedLines ch >> return '\n' -- | Parse header for a grid table. -gridTableHeader :: (Stream s m Char, Monad mf, IsString s, HasLastStrPosition st) +gridTableHeader :: (Monad m, Monad mf, HasLastStrPosition st) => Bool -- ^ Headerless table - -> ParserT s st m (mf Blocks) - -> ParserT s st m (mf [Blocks], [Alignment], [Int]) -gridTableHeader headless blocks = try $ do + -> ParserT Sources st m (mf Blocks) + -> ParserT Sources st m (mf [Blocks], [Alignment], [Int]) +gridTableHeader True _ = do + optional blanklines + dashes <- gridDashedLines '-' + let aligns = map snd dashes + let lines' = map (snd . fst) dashes + let indices = scanl (+) 0 lines' + return (return [], aligns, indices) +gridTableHeader False blocks = try $ do optional blanklines dashes <- gridDashedLines '-' - rawContent <- if headless - then return $ repeat "" - else many1 - (notFollowedBy (gridTableSep '=') >> char '|' >> + rawContent <- many1 (notFollowedBy (gridTableSep '=') >> char '|' >> T.pack <$> many1Till anyChar newline) - underDashes <- if headless - then return dashes - else gridDashedLines '=' + underDashes <- gridDashedLines '=' guard $ length dashes == length underDashes let lines' = map (snd . fst) underDashes let indices = scanl (+) 0 lines' let aligns = map snd underDashes - let rawHeads = if headless - then replicate (length underDashes) "" - else map (T.unlines . map trim) $ transpose + let rawHeads = map (T.unlines . map trim) $ transpose $ map (gridTableSplitLine indices) rawContent heads <- sequence <$> mapM (parseFromString' blocks . trim) rawHeads return (heads, aligns, indices) -gridTableRawLine :: Stream s m Char => [Int] -> ParserT s st m [Text] +gridTableRawLine :: (Stream s m Char, UpdateSourcePos s Char) => [Int] -> ParserT s st m [Text] gridTableRawLine indices = do char '|' line <- many1Till anyChar newline return (gridTableSplitLine indices $ T.pack line) -- | Parse row of grid table. -gridTableRow :: (Stream s m Char, Monad mf, IsString s, HasLastStrPosition st) - => ParserT s st m (mf Blocks) +gridTableRow :: (Monad m, Monad mf, HasLastStrPosition st) + => ParserT Sources st m (mf Blocks) -> [Int] - -> ParserT s st m (mf [Blocks]) + -> ParserT Sources st m (mf [Blocks]) gridTableRow blocks indices = do colLines <- many1 (gridTableRawLine indices) let cols = map ((<> "\n") . T.unlines . removeOneLeadingSpace) $ @@ -1099,34 +1207,38 @@ removeOneLeadingSpace xs = Just (c, _) -> c == ' ' -- | Parse footer for a grid table. 
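The `readWithM`/`readWith` hunk just below replaces the `Text` input with anything that has a `ToSources` instance, so parse errors and source positions can name the file a chunk came from. A minimal sketch of the two kinds of input callers can now pass; the file names are arbitrary:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import Text.Pandoc.Sources (Sources, toSources)

-- A single anonymous Text chunk behaves as before.
oneChunk :: Sources
oneChunk = toSources ("hello *world*\n" :: Text)

-- Several named chunks keep their file names, so positions reported in
-- errors and include handling point at the right input file.
manyChunks :: Sources
manyChunks = toSources
  ([ ("chap1.md", "# One\n")
   , ("chap2.md", "# Two\n")
   ] :: [(FilePath, Text)])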
-gridTableFooter :: Stream s m Char => ParserT s st m () +gridTableFooter :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () gridTableFooter = optional blanklines --- -- | Removes the ParsecT layer from the monad transformer stack -readWithM :: (Stream s m Char, ToText s) - => ParserT s st m a -- ^ parser - -> st -- ^ initial state - -> s -- ^ input +readWithM :: (Monad m, ToSources t) + => ParserT Sources st m a -- ^ parser + -> st -- ^ initial state + -> t -- ^ input -> m (Either PandocError a) readWithM parser state input = - mapLeft (PandocParsecError $ toText input) `liftM` runParserT parser state "source" input + mapLeft (PandocParsecError sources) + <$> runParserT parser state (initialSourceName sources) sources + where + sources = toSources input -- | Parse a string with a given parser and state -readWith :: Parser Text st a +readWith :: ToSources t + => Parser Sources st a -> st - -> Text + -> t -> Either PandocError a readWith p t inp = runIdentity $ readWithM p t inp -- | Parse a string with @parser@ (for testing). testStringWith :: Show a - => ParserT Text ParserState Identity a + => ParserT Sources ParserState Identity a -> Text -> IO () -testStringWith parser str = UTF8.putStrLn $ show $ - readWith parser defaultParserState str +testStringWith parser str = UTF8.putStrLn $ tshow $ + readWith parser defaultParserState (toSources str) -- | Parsing options. data ParserState = ParserState @@ -1146,7 +1258,7 @@ data ParserState = ParserState stateInNote :: Bool, -- ^ True if parsing note contents stateNoteNumber :: Int, -- ^ Last note number for citations stateMeta :: Meta, -- ^ Document metadata - stateMeta' :: F Meta, -- ^ Document metadata + stateMeta' :: Future ParserState Meta, -- ^ Document metadata stateCitations :: M.Map Text Text, -- ^ RST-style citations stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used stateIdentifiers :: Set.Set Text, -- ^ Header identifiers used @@ -1325,7 +1437,7 @@ data QuoteContext type NoteTable = [(Text, Text)] -type NoteTable' = M.Map Text (SourcePos, F Blocks) +type NoteTable' = M.Map Text (SourcePos, Future ParserState Blocks) -- used in markdown reader newtype Key = Key Text deriving (Show, Read, Eq, Ord) @@ -1360,7 +1472,7 @@ registerHeader (ident,classes,kvs) header' = do then do let id' = uniqueIdent exts (B.toList header') ids let id'' = if Ext_ascii_identifiers `extensionEnabled` exts - then T.pack $ mapMaybe toAsciiChar $ T.unpack id' + then toAsciiText id' else id' updateState $ updateIdentifierList $ Set.insert id' updateState $ updateIdentifierList $ Set.insert id'' @@ -1373,34 +1485,42 @@ registerHeader (ident,classes,kvs) header' = do updateState $ updateIdentifierList $ Set.insert ident return (ident,classes,kvs) -smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, + HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines -> ParserT s st m Inlines smartPunctuation inlineParser = do guardEnabled Ext_smart - choice [ quoted inlineParser, apostrophe, dash, ellipses ] - -apostrophe :: Stream s m Char => ParserT s st m Inlines -apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") + choice [ quoted inlineParser, apostrophe, doubleCloseQuote, dash, ellipses ] -quoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) +quoted :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) => 
ParserT s st m Inlines -> ParserT s st m Inlines quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser -singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines -> ParserT s st m Inlines -singleQuoted inlineParser = try $ B.singleQuoted . mconcat - <$ singleQuoteStart - <*> withQuoteContext InSingleQuote (many1Till inlineParser singleQuoteEnd) - -doubleQuoted :: (HasQuoteContext st m, Stream s m Char) +singleQuoted inlineParser = do + singleQuoteStart + (B.singleQuoted . mconcat <$> + try + (withQuoteContext InSingleQuote (many1Till inlineParser singleQuoteEnd))) + <|> pure "\8217" + +doubleQuoted :: (HasQuoteContext st m, HasLastStrPosition st, + Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines -> ParserT s st m Inlines -doubleQuoted inlineParser = try $ B.doubleQuoted . mconcat - <$ doubleQuoteStart - <*> withQuoteContext InDoubleQuote (manyTill inlineParser doubleQuoteEnd) +doubleQuoted inlineParser = do + doubleQuoteStart + (B.doubleQuoted . mconcat <$> + try + (withQuoteContext InDoubleQuote (manyTill inlineParser doubleQuoteEnd))) + <|> pure (B.str "\8220") failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) => QuoteContext @@ -1409,13 +1529,14 @@ failIfInQuoteContext context = do context' <- getQuoteContext when (context' == context) $ Prelude.fail "already inside quotes" -charOrRef :: Stream s m Char => [Char] -> ParserT s st m Char +charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParserT s st m Char charOrRef cs = oneOf cs <|> try (do c <- characterReference guard (c `elem` cs) return c) -singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () singleQuoteStart = do failIfInQuoteContext InSingleQuote @@ -1423,30 +1544,39 @@ singleQuoteStart = do guard =<< notAfterString try $ do charOrRef "'\8216\145" - notFollowedBy (oneOf [' ', '\t', '\n']) + void $ lookAhead (satisfy (not . isSpaceChar)) -singleQuoteEnd :: Stream s m Char +singleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () singleQuoteEnd = try $ do charOrRef "'\8217\146" notFollowedBy alphaNum -doubleQuoteStart :: (HasQuoteContext st m, Stream s m Char) +doubleQuoteStart :: (HasLastStrPosition st, + HasQuoteContext st m, + Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () doubleQuoteStart = do failIfInQuoteContext InDoubleQuote + guard =<< notAfterString try $ do charOrRef "\"\8220\147" - notFollowedBy (oneOf [' ', '\t', '\n']) + void $ lookAhead (satisfy (not . isSpaceChar)) -doubleQuoteEnd :: Stream s m Char +doubleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () doubleQuoteEnd = void (charOrRef "\"\8221\148") -ellipses :: Stream s m Char +apostrophe :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines +apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\8217") + +doubleCloseQuote :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines +doubleCloseQuote = B.str "\8221" <$ char '"' + +ellipses :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines ellipses = try (string "..." 
>> return (B.str "\8230")) -dash :: (HasReaderOptions st, Stream s m Char) +dash :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines dash = try $ do oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions @@ -1473,20 +1603,28 @@ nested p = do updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res -citeKey :: (Stream s m Char, HasLastStrPosition st) - => ParserT s st m (Bool, Text) -citeKey = try $ do +citeKey :: (Stream s m Char, UpdateSourcePos s Char, HasLastStrPosition st) + => Bool -- ^ If True, allow expanded @{..} syntax. + -> ParserT s st m (Bool, Text) +citeKey allowBraced = try $ do guard =<< notAfterString suppress_author <- option False (True <$ char '-') char '@' + key <- simpleCiteIdentifier + <|> if allowBraced + then charsInBalanced '{' '}' (satisfy (not . isSpace)) + else mzero + return (suppress_author, key) + +simpleCiteIdentifier :: (Stream s m Char, UpdateSourcePos s Char) + => ParserT s st m Text +simpleCiteIdentifier = do firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite let regchar = satisfy (\c -> isAlphaNum c || c == '_') let internal p = try $ p <* lookAhead regchar rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|> try (oneOf ":/" <* lookAhead (char '/')) - let key = firstChar:rest - return (suppress_author, T.pack key) - + return $ T.pack $ firstChar:rest token :: (Stream s m t) => (t -> Text) @@ -1506,12 +1644,15 @@ extractIdClass (ident, cls, kvs) = (ident', cls', kvs') cls' = maybe cls T.words $ lookup "class" kvs kvs' = filter (\(k,_) -> k /= "id" || k /= "class") kvs -insertIncludedFile' :: (PandocMonad m, HasIncludeFiles st) - => ParserT a st m (mf Blocks) - -> (Text -> a) - -> [FilePath] -> FilePath - -> ParserT a st m (mf Blocks) -insertIncludedFile' blocks totoks dirs f = do +insertIncludedFile :: (PandocMonad m, HasIncludeFiles st) + => ParserT a st m b -- ^ parser to apply + -> (Text -> a) -- ^ convert Text to stream type + -> [FilePath] -- ^ search path (directories) + -> FilePath -- ^ path of file to include + -> Maybe Int -- ^ start line (negative counts from end) + -> Maybe Int -- ^ end line (negative counts from end) + -> ParserT a st m b +insertIncludedFile parser toStream dirs f mbstartline mbendline = do oldPos <- getPosition oldInput <- getInput containers <- getIncludeFiles <$> getState @@ -1520,32 +1661,32 @@ insertIncludedFile' blocks totoks dirs f = do updateState $ addIncludeFile $ T.pack f mbcontents <- readFileFromDirs dirs f contents <- case mbcontents of - Just s -> return s + Just s -> return $ exciseLines mbstartline mbendline s Nothing -> do report $ CouldNotLoadIncludeFile (T.pack f) oldPos return "" - setPosition $ newPos f 1 1 - setInput $ totoks contents - bs <- blocks + setInput $ toStream contents + setPosition $ newPos f (fromMaybe 1 mbstartline) 1 + result <- parser setInput oldInput setPosition oldPos updateState dropLatestIncludeFile - return bs + return result + +exciseLines :: Maybe Int -> Maybe Int -> Text -> Text +exciseLines Nothing Nothing t = t +exciseLines mbstartline mbendline t = + T.unlines $ take (endline' - (startline' - 1)) + $ drop (startline' - 1) contentLines + where + contentLines = T.lines t + numLines = length contentLines + startline' = case mbstartline of + Nothing -> 1 + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end + endline' = case mbendline of + Nothing -> numLines + Just x | x >= 0 -> x + | otherwise -> numLines + x -- negative from end --- | Parse 
content of include file as blocks. Circular includes result in an --- @PandocParseError@. -insertIncludedFile :: (PandocMonad m, HasIncludeFiles st) - => ParserT [a] st m Blocks - -> (Text -> [a]) - -> [FilePath] -> FilePath - -> ParserT [a] st m Blocks -insertIncludedFile blocks totoks dirs f = - runIdentity <$> insertIncludedFile' (Identity <$> blocks) totoks dirs f - --- | Parse content of include file as future blocks. Circular includes result in --- an @PandocParseError@. -insertIncludedFileF :: (PandocMonad m, HasIncludeFiles st) - => ParserT Text st m (Future st Blocks) - -> [FilePath] -> FilePath - -> ParserT Text st m (Future st Blocks) -insertIncludedFileF p = insertIncludedFile' p id diff --git a/src/Text/Pandoc/Process.hs b/src/Text/Pandoc/Process.hs index 866972e3f..b896feb7e 100644 --- a/src/Text/Pandoc/Process.hs +++ b/src/Text/Pandoc/Process.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Process - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 9a069f7d0..5106f8058 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -1,9 +1,10 @@ {-# LANGUAGE MonoLocalBinds #-} +{-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -65,12 +66,14 @@ import Data.Aeson import qualified Data.ByteString.Lazy as BL import Data.Text (Text) import qualified Data.Text as T +import Text.Pandoc.Shared (tshow) import Text.Pandoc.Class import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Extensions import Text.Pandoc.Options import Text.Pandoc.Readers.CommonMark +import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx @@ -83,7 +86,6 @@ import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.JATS (readJATS) import Text.Pandoc.Readers.Jira (readJira) import Text.Pandoc.Readers.LaTeX -import Text.Pandoc.Readers.Markdown import Text.Pandoc.Readers.MediaWiki import Text.Pandoc.Readers.Muse import Text.Pandoc.Readers.Native @@ -101,59 +103,60 @@ import Text.Pandoc.Readers.CSV import Text.Pandoc.Readers.CslJson import Text.Pandoc.Readers.BibTeX import qualified Text.Pandoc.UTF8 as UTF8 -import Text.Parsec.Error +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -data Reader m = TextReader (ReaderOptions -> Text -> m Pandoc) +data Reader m = TextReader (forall a . ToSources a => + ReaderOptions -> a -> m Pandoc) | ByteStringReader (ReaderOptions -> BL.ByteString -> m Pandoc) -- | Association list of formats and readers. 
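With `Reader m` now carrying a Rank-N `ToSources` constraint (hence the new `RankNTypes` pragma above), a text reader obtained from `getReader` can be handed plain `Text` or a list of named chunks directly. A rough driver sketch with error handling kept minimal; `parseWith` is an invented name, and the `Default` instance of `ReaderOptions` is assumed:

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.ByteString.Lazy as BL
import Data.Default (def)
import Data.Text (Text)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition (Pandoc)
import Text.Pandoc.Options (readerExtensions)
import Text.Pandoc.Readers (Reader (..), getReader)
import qualified Text.Pandoc.UTF8 as UTF8

-- Look up a reader by format spec and run it on named input chunks.
parseWith :: PandocMonad m => Text -> [(FilePath, Text)] -> m Pandoc
parseWith format chunks = do
  (rdr, exts) <- getReader format
  let ropts = def{ readerExtensions = exts }
  case rdr of
    TextReader r       -> r ropts chunks
    ByteStringReader r ->
      -- byte-based readers (docx, odt, epub, ...) just get the bytes
      r ropts (BL.fromStrict . UTF8.fromText . mconcat $ map snd chunks)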
readers :: PandocMonad m => [(Text, Reader m)] -readers = [ ("native" , TextReader readNative) - ,("json" , TextReader readJSON) - ,("markdown" , TextReader readMarkdown) - ,("markdown_strict" , TextReader readMarkdown) - ,("markdown_phpextra" , TextReader readMarkdown) - ,("markdown_github" , TextReader readMarkdown) - ,("markdown_mmd", TextReader readMarkdown) - ,("commonmark" , TextReader readCommonMark) - ,("commonmark_x" , TextReader readCommonMark) - ,("creole" , TextReader readCreole) - ,("dokuwiki" , TextReader readDokuWiki) - ,("gfm" , TextReader readCommonMark) - ,("rst" , TextReader readRST) - ,("mediawiki" , TextReader readMediaWiki) - ,("vimwiki" , TextReader readVimwiki) - ,("docbook" , TextReader readDocBook) - ,("opml" , TextReader readOPML) - ,("org" , TextReader readOrg) - ,("textile" , TextReader readTextile) -- TODO : textile+lhs - ,("html" , TextReader readHtml) - ,("jats" , TextReader readJATS) - ,("jira" , TextReader readJira) - ,("latex" , TextReader readLaTeX) - ,("haddock" , TextReader readHaddock) - ,("twiki" , TextReader readTWiki) - ,("tikiwiki" , TextReader readTikiWiki) - ,("docx" , ByteStringReader readDocx) - ,("odt" , ByteStringReader readOdt) - ,("t2t" , TextReader readTxt2Tags) - ,("epub" , ByteStringReader readEPUB) - ,("muse" , TextReader readMuse) - ,("man" , TextReader readMan) - ,("fb2" , TextReader readFB2) - ,("ipynb" , TextReader readIpynb) - ,("csv" , TextReader readCSV) - ,("csljson" , TextReader readCslJson) - ,("bibtex" , TextReader readBibTeX) - ,("biblatex" , TextReader readBibLaTeX) +readers = [("native" , TextReader readNative) + ,("json" , TextReader readJSON) + ,("markdown" , TextReader readMarkdown) + ,("markdown_strict" , TextReader readMarkdown) + ,("markdown_phpextra" , TextReader readMarkdown) + ,("markdown_github" , TextReader readMarkdown) + ,("markdown_mmd", TextReader readMarkdown) + ,("commonmark" , TextReader readCommonMark) + ,("commonmark_x" , TextReader readCommonMark) + ,("creole" , TextReader readCreole) + ,("dokuwiki" , TextReader readDokuWiki) + ,("gfm" , TextReader readCommonMark) + ,("rst" , TextReader readRST) + ,("mediawiki" , TextReader readMediaWiki) + ,("vimwiki" , TextReader readVimwiki) + ,("docbook" , TextReader readDocBook) + ,("opml" , TextReader readOPML) + ,("org" , TextReader readOrg) + ,("textile" , TextReader readTextile) -- TODO : textile+lhs + ,("html" , TextReader readHtml) + ,("jats" , TextReader readJATS) + ,("jira" , TextReader readJira) + ,("latex" , TextReader readLaTeX) + ,("haddock" , TextReader readHaddock) + ,("twiki" , TextReader readTWiki) + ,("tikiwiki" , TextReader readTikiWiki) + ,("docx" , ByteStringReader readDocx) + ,("odt" , ByteStringReader readOdt) + ,("t2t" , TextReader readTxt2Tags) + ,("epub" , ByteStringReader readEPUB) + ,("muse" , TextReader readMuse) + ,("man" , TextReader readMan) + ,("fb2" , TextReader readFB2) + ,("ipynb" , TextReader readIpynb) + ,("csv" , TextReader readCSV) + ,("csljson" , TextReader readCslJson) + ,("bibtex" , TextReader readBibTeX) + ,("biblatex" , TextReader readBibLaTeX) ] -- | Retrieve reader, extensions based on formatSpec (format+extensions). 
getReader :: PandocMonad m => Text -> m (Reader m, Extensions) getReader s = case parseFormatSpec s of - Left e -> throwError $ PandocAppError - $ T.intercalate "\n" [T.pack m | Message m <- errorMessages e] + Left e -> throwError $ PandocAppError $ + "Error parsing reader format " <> tshow s <> ": " <> tshow e Right (readerName, extsToEnable, extsToDisable) -> case lookup readerName readers of Nothing -> throwError $ PandocUnknownReaderError @@ -173,9 +176,13 @@ getReader s = return (r, exts) -- | Read pandoc document from JSON format. -readJSON :: PandocMonad m - => ReaderOptions -> Text -> m Pandoc -readJSON _ t = - case eitherDecode' . BL.fromStrict . UTF8.fromText $ t of +readJSON :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readJSON _ s = + case eitherDecode' . BL.fromStrict . UTF8.fromText + . sourcesToText . toSources $ s of Right doc -> return doc - Left e -> throwError $ PandocParseError ("JSON parse error: " <> T.pack e) + Left e -> throwError $ PandocParseError ("JSON parse error: " + <> T.pack e) diff --git a/src/Text/Pandoc/Readers/BibTeX.hs b/src/Text/Pandoc/Readers/BibTeX.hs index 6c96ab30a..318afda85 100644 --- a/src/Text/Pandoc/Readers/BibTeX.hs +++ b/src/Text/Pandoc/Readers/BibTeX.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.BibTeX - Copyright : Copyright (C) 2020 John MacFarlane + Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -23,41 +23,47 @@ where import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder (setMeta, cite, str) -import Data.Text (Text) import Citeproc (Lang(..), parseLang) import Citeproc.Locale (getLocale) import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Class (PandocMonad, lookupEnv) import Text.Pandoc.Citeproc.BibTeX as BibTeX import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue) +import Text.Pandoc.Sources (ToSources(..)) import Control.Monad.Except (throwError) -- | Read BibTeX from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. -readBibTeX :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readBibTeX :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc readBibTeX = readBibTeX' BibTeX.Bibtex -- | Read BibLaTeX from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. 
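For orientation, an illustrative sketch (not part of the patch) of driving an arbitrary text-based format through the constructors above; parseWith is a hypothetical helper, while getReader, Reader(..), readerExtensions, runIOorExplode and PandocAppError are the pandoc names used in this diff:

    {-# LANGUAGE OverloadedStrings #-}
    import Control.Monad.Except (throwError)
    import Data.Text (Text)
    import Text.Pandoc

    -- Resolve a format spec such as "markdown+smart", then run its reader.
    parseWith :: Text -> Text -> IO Pandoc
    parseWith formatSpec input = runIOorExplode $ do
      (reader, exts) <- getReader formatSpec
      let ropts = def{ readerExtensions = exts }
      case reader of
        TextReader r       -> r ropts input   -- any ToSources input works here
        ByteStringReader _ -> throwError $
          PandocAppError "expected a text-based input format"

For binary formats such as docx the ByteStringReader branch would read the bytes instead of failing; the sketch only covers the text case.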
-readBibLaTeX :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readBibLaTeX :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc readBibLaTeX = readBibTeX' BibTeX.Biblatex -readBibTeX' :: PandocMonad m => Variant -> ReaderOptions -> Text -> m Pandoc +readBibTeX' :: (PandocMonad m, ToSources a) + => Variant -> ReaderOptions -> a -> m Pandoc readBibTeX' variant _opts t = do - lang <- maybe (Lang "en" (Just "US")) parseLang - <$> lookupEnv "LANG" + mblangEnv <- lookupEnv "LANG" + let defaultLang = Lang "en" Nothing (Just "US") [] [] [] + let lang = case mblangEnv of + Nothing -> defaultLang + Just l -> either (const defaultLang) id $ parseLang l locale <- case getLocale lang of Left e -> - case getLocale (Lang "en" (Just "US")) of + case getLocale (Lang "en" Nothing (Just "US") [] [] []) of Right l -> return l Left _ -> throwError $ PandocCiteprocError e Right l -> return l case BibTeX.readBibtexString variant locale (const True) t of - Left e -> throwError $ PandocParsecError t e + Left e -> throwError $ PandocParsecError (toSources t) e Right refs -> return $ setMeta "references" (map referenceToMetaValue refs) . setMeta "nocite" diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs index f0edcaa16..eca8f9425 100644 --- a/src/Text/Pandoc/Readers/CSV.hs +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -2,8 +2,8 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | - Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2020 John MacFarlane + Module : Text.Pandoc.Readers.CSV + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -13,23 +13,23 @@ Conversion from CSV to a 'Pandoc' table. -} module Text.Pandoc.Readers.CSV ( readCSV ) where -import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.CSV (parseCSV, defaultCSVOptions) import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad) -import Text.Pandoc.Shared (crFilter) import Text.Pandoc.Error +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Options (ReaderOptions) import Control.Monad.Except (throwError) -readCSV :: PandocMonad m +readCSV :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc -readCSV _opts s = - case parseCSV defaultCSVOptions (crFilter s) of +readCSV _opts s = do + let txt = sourcesToText $ toSources s + case parseCSV defaultCSVOptions txt of Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) (TableHead nullAttr hdrs) @@ -45,4 +45,4 @@ readCSV _opts s = aligns = replicate numcols AlignDefault widths = replicate numcols ColWidthDefault Right [] -> return $ B.doc mempty - Left e -> throwError $ PandocParsecError s e + Left e -> throwError $ PandocParsecError (toSources [("",txt)]) e diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index c1773eaab..411d64278 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,8 +1,9 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.CommonMark - Copyright : Copyright (C) 2015-2020 John MacFarlane + Copyright : Copyright (C) 2015-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -25,17 +26,66 @@ import Text.Pandoc.Definition 
import Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Error +import Text.Pandoc.Readers.Metadata (yamlMetaBlock) import Control.Monad.Except import Data.Functor.Identity (runIdentity) +import Data.Typeable +import Text.Pandoc.Parsing (runParserT, getInput, + runF, defaultParserState, option, many1, anyChar, + Sources(..), ToSources(..), ParserT, Future, + sourceName) +import qualified Data.Text as T -- | Parse a CommonMark formatted string into a 'Pandoc' structure. -readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCommonMark opts s = do - let res = runIdentity $ - commonmarkWith (foldr ($) defaultSyntaxSpec exts) "input" s - case res of - Left err -> throwError $ PandocParsecError s err - Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls +readCommonMark :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readCommonMark opts s + | isEnabled Ext_yaml_metadata_block opts = do + let sources = toSources s + let toks = concatMap sourceToToks (unSources sources) + res <- runParserT (do meta <- yamlMetaBlock (metaValueParser opts) + rest <- getInput + return (meta, rest)) + defaultParserState "YAML metadata" (toSources s) + case res of + Left _ -> readCommonMarkBody opts sources toks + Right (meta, rest) -> do + -- strip off metadata section and parse body + let body = concatMap sourceToToks (unSources rest) + Pandoc _ bs <- readCommonMarkBody opts sources body + return $ Pandoc (runF meta defaultParserState) bs + | otherwise = do + let sources = toSources s + let toks = concatMap sourceToToks (unSources sources) + readCommonMarkBody opts sources toks + +sourceToToks :: (SourcePos, Text) -> [Tok] +sourceToToks (pos, s) = tokenize (sourceName pos) s + +metaValueParser :: Monad m + => ReaderOptions -> ParserT Sources st m (Future st MetaValue) +metaValueParser opts = do + inp <- option "" $ T.pack <$> many1 anyChar + let toks = concatMap sourceToToks (unSources (toSources inp)) + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left _ -> mzero + Right (Cm bls :: Cm () Blocks) -> return $ return $ B.toMetaValue bls + +readCommonMarkBody :: PandocMonad m => ReaderOptions -> Sources -> [Tok] -> m Pandoc +readCommonMarkBody opts s toks + | isEnabled Ext_sourcepos opts = + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm SourceRange Blocks) -> return $ B.doc bls + | otherwise = + case runIdentity (parseCommonmarkWith (specFor opts) toks) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls + +specFor :: (Monad m, Typeable m, Typeable a, + Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) + => ReaderOptions -> SyntaxSpec m (Cm a Inlines) (Cm a Blocks) +specFor opts = foldr ($) defaultSyntaxSpec exts where exts = [ (hardLineBreaksSpec <>) | isEnabled Ext_hard_line_breaks opts ] ++ [ (smartPunctuationSpec <>) | isEnabled Ext_smart opts ] ++ @@ -62,5 +112,7 @@ readCommonMark opts s = do | isEnabled Ext_implicit_header_references opts ] ++ [ (footnoteSpec <>) | isEnabled Ext_footnotes opts ] ++ [ (definitionListSpec <>) | isEnabled Ext_definition_lists opts ] ++ - [ (taskListSpec <>) | isEnabled Ext_task_lists opts ] + [ (taskListSpec <>) | isEnabled Ext_task_lists opts ] ++ + [ (rebaseRelativePathsSpec <>) + | isEnabled Ext_rebase_relative_paths opts ] diff --git a/src/Text/Pandoc/Readers/Creole.hs b/src/Text/Pandoc/Readers/Creole.hs index 2658dfea2..ad848ada7 100644 --- 
a/src/Text/Pandoc/Readers/Creole.hs +++ b/src/Text/Pandoc/Readers/Creole.hs @@ -23,21 +23,20 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed) -import Text.Pandoc.Shared (crFilter) - -- | Read creole from an input string and return a Pandoc document. -readCreole :: PandocMonad m +readCreole :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readCreole opts s = do - res <- readWithM parseCreole def{ stateOptions = opts } $ crFilter s <> "\n\n" + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseCreole def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type CRLParser = ParserT Text ParserState +type CRLParser = ParserT Sources ParserState -- -- Utility functions diff --git a/src/Text/Pandoc/Readers/CslJson.hs b/src/Text/Pandoc/Readers/CslJson.hs index 377186b1e..a0af5c325 100644 --- a/src/Text/Pandoc/Readers/CslJson.hs +++ b/src/Text/Pandoc/Readers/CslJson.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.CslJson - Copyright : Copyright (C) 2020 John MacFarlane + Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -24,21 +24,22 @@ import Text.Pandoc.Options import Text.Pandoc.Definition import Text.Pandoc.Builder (setMeta, cite, str) import qualified Text.Pandoc.UTF8 as UTF8 -import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Citeproc.CslJson (cslJsonToReferences) import Text.Pandoc.Citeproc.MetaValue (referenceToMetaValue) import Control.Monad.Except (throwError) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -- | Read CSL JSON from an input string and return a Pandoc document. -- The document will have only metadata, with an empty body. -- The metadata will contain a `references` field with the -- bibliography entries, and a `nocite` field with the wildcard `[@*]`. -readCslJson :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCslJson _opts t = - case cslJsonToReferences (UTF8.fromText t) of +readCslJson :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readCslJson _opts x = + case cslJsonToReferences (UTF8.fromText $ sourcesToText $ toSources x) of Left e -> throwError $ PandocParseError $ T.pack e Right refs -> return $ setMeta "references" (map referenceToMetaValue refs) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index b0846e345..c49b82ccf 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.DocBook - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -12,23 +12,28 @@ Conversion of DocBook XML to 'Pandoc' document. 
-} module Text.Pandoc.Readers.DocBook ( readDocBook ) where import Control.Monad.State.Strict -import Data.Char (isSpace, toUpper) +import Data.Char (isSpace, isLetter) import Data.Default import Data.Either (rights) import Data.Foldable (asum) import Data.Generics import Data.List (intersperse,elemIndex) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe,mapMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Control.Monad.Except (throwError) import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import Text.Pandoc.Options import Text.Pandoc.Logging (LogMessage(..)) -import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) +import Text.Pandoc.Shared (safeRead, extractSpaces) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light +import Text.Pandoc.XML.Light {- @@ -92,7 +97,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] chapterinfo - Meta-information for a Chapter [ ] citation - An inline bibliographic reference to another published work [ ] citebiblioid - A citation of a bibliographic identifier -[ ] citerefentry - A citation to a reference page +[x] citerefentry - A citation to a reference page [ ] citetitle - The title of a cited work [ ] city - The name of a city in an address [x] classname - The name of a class, in the object-oriented programming sense @@ -129,6 +134,7 @@ List of all DocBook tags, with [x] indicating implemented, [ ] corpcredit - A corporation or organization credited in a document [ ] corpname - The name of a corporation [ ] country - The name of a country +[x] danger - An admonition set off from the text indicating hazardous situation [ ] database - The name of a database, or part of a database [x] date - The date of publication or revision of a document [ ] dedication - A wrapper for the dedication section of a book @@ -206,7 +212,7 @@ List of all DocBook tags, with [x] indicating implemented, [x] info - A wrapper for information about a component or other block. (DocBook v5) [x] informalequation - A displayed mathematical equation without a title [x] informalexample - A displayed example without a title -[ ] informalfigure - A untitled figure +[x] informalfigure - An untitled figure [ ] informaltable - A table without a title [ ] initializer - The initializer for a FieldSynopsis [x] inlineequation - A mathematical equation or expression occurring inline @@ -535,24 +541,32 @@ instance Default DBState where , dbContent = [] } -readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readDocBook :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readDocBook _ inp = do - let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp + let sources = toSources inp + tree <- either (throwError . PandocXMLError "") return $ + parseXMLContents + (TL.fromStrict . handleInstructions . sourcesToText $ sources) (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree return $ Pandoc (dbMeta st') (toList . mconcat $ bs) --- We treat <?asciidoc-br?> specially (issue #1236), converting it --- to <br/>, since xml-light doesn't parse the instruction correctly. --- Other xml instructions are simply removed from the input stream. +-- We treat certain processing instructions by converting them to tags +-- beginning "pi-". 
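As a point of reference for the rewritten handleInstructions that follows (an added illustration, not part of the patch): the two recognized instructions, asciidoc-br and dbfo, become empty elements whose names carry a pi- prefix, so they survive XML parsing and can be matched later (see the pi-dbfo table-width lookup further down), while any other processing instruction passes through unchanged. Since the function is internal to the DocBook reader, the expected behaviour is stated here as plain equalities:

    handleInstructions "a<?asciidoc-br?>b"            == "a<pi-asciidoc-br/>b"
    handleInstructions "<?dbfo table-width=\"50%\"?>" == "<pi-dbfo table-width=\"50%\"/>"
    handleInstructions "<?php echo 1; ?>x"            == "<?php echo 1; ?>x"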
handleInstructions :: Text -> Text -handleInstructions = T.pack . handleInstructions' . T.unpack - -handleInstructions' :: String -> String -handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs -handleInstructions' xs = case break (=='<') xs of - (ys, []) -> ys - ([], '<':zs) -> '<' : handleInstructions' zs - (ys, zs) -> ys ++ handleInstructions' zs +handleInstructions t = + let (x,y) = T.breakOn "<?" t + in if T.null y + then x + else + let (w,z) = T.breakOn "?>" y + in (if T.takeWhile (\c -> isLetter c || c == '-') + (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"] + then x <> "<pi-" <> T.drop 2 w <> "/>" + else x <> w <> T.take 2 z) <> + handleInstructions (T.drop 2 z) getFigure :: PandocMonad m => Element -> DB m Blocks getFigure e = do @@ -565,32 +579,14 @@ getFigure e = do modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty } return res --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: Text -> Element -> Bool -named s e = qName (elName e) == T.unpack s +named s e = qName (elName e) == s -- @@ -605,16 +601,24 @@ addMetadataFromElement e = do Nothing -> return () Just z -> addMetaField "author" z addMetaField "subtitle" e - addMetaField "author" e + addAuthor e addMetaField "date" e addMetaField "release" e addMetaField "releaseinfo" e return mempty - where addMetaField fieldname elt = - case filterChildren (named fieldname) elt of - [] -> return () - [z] -> getInlines z >>= addMeta fieldname - zs -> mapM getInlines zs >>= addMeta fieldname + where + addAuthor elt = + case filterChildren (named "author") elt of + [] -> return () + [z] -> fromAuthor z >>= addMeta "author" + zs -> mapM fromAuthor zs >>= addMeta "author" + fromAuthor elt = + mconcat . 
intersperse space <$> mapM getInlines (elChildren elt) + addMetaField fieldname elt = + case filterChildren (named fieldname) elt of + [] -> return () + [z] -> getInlines z >>= addMeta fieldname + zs -> mapM getInlines zs >>= addMeta fieldname addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> DB m () addMeta field val = modify (setMeta field val) @@ -627,7 +631,7 @@ isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blockTags isBlockElement _ = False -blockTags :: [String] +blockTags :: [Text] blockTags = [ "abstract" , "ackno" @@ -669,6 +673,7 @@ blockTags = , "index" , "info" , "informalexample" + , "informalfigure" , "informaltable" , "itemizedlist" , "linegroup" @@ -713,8 +718,8 @@ blockTags = , "variablelist" ] ++ admonitionTags -admonitionTags :: [String] -admonitionTags = ["important","caution","note","tip","warning"] +admonitionTags :: [Text] +admonitionTags = ["caution","danger","important","note","tip","warning"] -- Trim leading and trailing newline characters trimNl :: Text -> Text @@ -736,9 +741,9 @@ getMediaobject e = do figTitle <- gets dbFigureTitle ident <- gets dbFigureId (imageUrl, attr) <- - case filterChild (named "imageobject") e of - Nothing -> return (mempty, nullAttr) - Just z -> case filterChild (named "imagedata") z of + case filterElements (named "imageobject") e of + [] -> return (mempty, nullAttr) + (z:_) -> case filterChild (named "imagedata") z of Nothing -> return (mempty, nullAttr) Just i -> let atVal a = attrValue a i w = case atVal "width" of @@ -771,10 +776,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "toc" -> skip -- skip TOC, since in pandoc it's autogenerated @@ -829,7 +834,7 @@ parseBlock (Elem e) = "refsect2" -> sect 2 "refsect3" -> sect 3 "refsection" -> gets dbSectionLevel >>= sect . (+1) - l | l `elem` admonitionTags -> parseAdmonition $ T.pack l + l | l `elem` admonitionTags -> parseAdmonition l "area" -> skip "areaset" -> skip "areaspec" -> skip @@ -855,6 +860,7 @@ parseBlock (Elem e) = "variablelist" -> definitionList <$> deflistitems "procedure" -> bulletList <$> steps "figure" -> getFigure e + "informalfigure" -> getFigure e "mediaobject" -> para <$> getMediaobject e "caption" -> skip "info" -> addMetadataFromElement e @@ -890,7 +896,11 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then "<?" 
<> qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty codeBlockWithLang = do @@ -898,7 +908,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ T.pack $ strContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -952,17 +962,16 @@ parseBlock (Elem e) = w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> (x >= '0' && x <= '9') - || x == '.') (T.pack w) + || x == '.') w if n > 0 then Just n else Nothing - let numrows = case bodyrows of - [] -> 0 - xs -> maximum $ map length xs + let numrows = maybe 0 maximum $ nonEmpty + $ map length bodyrows let aligns = case colspecs of [] -> replicate numrows AlignDefault cs -> map toAlignment cs let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9') || x == '.') s) - let textWidth = case filterChild (named "?dbfo") e of + let textWidth = case filterChild (named "pi-dbfo") e of Just d -> case attrValue "table-width" d of "" -> 1.0 w -> fromMaybe 100.0 (parseWidth w) / 100.0 @@ -1035,12 +1044,12 @@ parseMixed container conts = do x <- parseMixed container rs return $ p <> b <> x -parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell] +parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell] parseRow cn = do let isEntry x = named "entry" x || named "td" x || named "th" x mapM (parseEntry cn) . filterChildren isEntry -parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell +parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell parseEntry cn el = do let colDistance sa ea = do let iStrt = elemIndex sa cn @@ -1062,7 +1071,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines getInlines e' = trimInlines . mconcat <$> mapM parseInline (elContent e') -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . 
(\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -1071,16 +1080,16 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> DB m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s +parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref + return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "anchor" -> do return $ spanWith (attrValue "id" e, [], []) mempty "phrase" -> do let ident = attrValue "id" e - let classes = T.words $ attrValue "class" e + let classes = T.words $ attrValue "role" e if ident /= "" || classes /= [] then innerInlines (spanWith (ident,classes,[])) else innerInlines id @@ -1103,6 +1112,10 @@ parseInline (Elem e) = "segmentedlist" -> segmentedList "classname" -> codeWithLang "code" -> codeWithLang + "citerefentry" -> do + let title = maybe mempty strContent $ filterChild (named "refentrytitle") e + let manvolnum = maybe mempty (\el -> "(" <> strContent el <> ")") $ filterChild (named "manvolnum") e + return $ codeWith ("",["citerefentry"],[]) (title <> manvolnum) "filename" -> codeWithLang "envar" -> codeWithLang "literal" -> codeWithLang @@ -1125,7 +1138,7 @@ parseInline (Elem e) = "userinput" -> codeWithLang "systemitem" -> codeWithLang "varargs" -> return $ code "(...)" - "keycap" -> return (str $ T.pack $ strContent e) + "keycap" -> return (str $ strContent e) "keycombo" -> keycombo <$> mapM parseInline (elContent e) "menuchoice" -> menuchoice <$> @@ -1137,17 +1150,17 @@ parseInline (Elem e) = let title = case attrValue "endterm" e of "" -> maybe "???" xrefTitleByElem (findElementById linkend content) - endterm -> maybe "???" (T.pack . strContent) + endterm -> maybe "???" strContent (findElementById endterm content) return $ link ("#" <> linkend) "" (text title) - "email" -> return $ link ("mailto:" <> T.pack (strContent e)) "" - $ str $ T.pack $ strContent e - "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e + "email" -> return $ link ("mailto:" <> strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e "ulink" -> innerInlines (link (attrValue "url" e) "") "link" -> do ils <- innerInlines id let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, T.words $ attrValue "role" e, []) @@ -1163,12 +1176,15 @@ parseInline (Elem e) = "title" -> return mempty "affiliation" -> skip -- Note: this isn't a real docbook tag; it's what we convert - -- <?asciidor-br?> to in handleInstructions, above. A kludge to - -- work around xml-light's inability to parse an instruction. - "br" -> return linebreak + -- <?asciidor-br?> to in handleInstructions, above. + "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then "<?" <> qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty innerInlines f = extractSpaces f . 
mconcat <$> @@ -1177,7 +1193,7 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -1218,10 +1234,10 @@ parseInline (Elem e) = "sect5" -> descendantContent "title" el "cmdsynopsis" -> descendantContent "command" el "funcsynopsis" -> descendantContent "function" el - _ -> T.pack $ qName (elName el) ++ "_title" + _ -> qName (elName el) <> "_title" where xrefLabel = attrValue "xreflabel" el - descendantContent name = maybe "???" (T.pack . strContent) + descendantContent name = maybe "???" strContent . filterElementName (\n -> qName n == name) -- | Extract a math equation from an element @@ -1241,8 +1257,9 @@ equation e constructor = where mathMLEquations :: [Text] mathMLEquations = map writeTeX $ rights $ readMath - (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") - (readMathML . T.pack . showElement) + (\x -> qName (elName x) == "math" && + qURI (elName x) == Just "http://www.w3.org/1998/Math/MathML") + (readMathML . showElement) latexEquations :: [Text] latexEquations = readMath (\x -> qName (elName x) == "mathphrase") @@ -1256,8 +1273,8 @@ equation e constructor = -- | Get the actual text stored in a CData block. 'showContent' -- returns the text still surrounded by the [[CDATA]] tags. showVerbatimCData :: Content -> Text -showVerbatimCData (Text (CData _ d _)) = T.pack d -showVerbatimCData c = T.pack $ showContent c +showVerbatimCData (Text (CData _ d _)) = d +showVerbatimCData c = showContent c -- | Set the prefix of a name to 'Nothing' diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 00de6a0cd..c06adf7e3 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx import Codec.Archive.Zip import Control.Monad.Reader import Control.Monad.State.Strict +import Data.Bifunctor (bimap, first) import qualified Data.ByteString.Lazy as B import Data.Default (Default) -import Data.List (delete, intersect) +import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (catMaybes, isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -85,6 +86,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Error import Text.Pandoc.Logging +import Data.List.NonEmpty (nonEmpty) readDocx :: PandocMonad m => ReaderOptions @@ -112,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines + , docxTableCaptions :: [Blocks] } instance Default DState where @@ -122,6 +125,7 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty + , docxTableCaptions = [] } data DEnv = DEnv { docxOptions :: ReaderOptions @@ -490,15 +494,32 @@ singleParaToPlain blks singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks -cellToBlocks (Docx.Cell bps) = do +cellToCell :: PandocMonad m => 
RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell +cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps - return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks') + +rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row] +rowsToRows rows = do + let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows) + cells <- traverse (traverse (uncurry cellToCell)) rowspans + return (fmap (Pandoc.Row nullAttr) cells) + +splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row]) +splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst + $ if hasFirstRowFormatting + then foldl' f ((take 1 rs, []), True) (drop 1 rs) + else foldl' f (([], []), False) rs + where + f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs) + | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs) + = ((r : headerRows, bodyRows), True) + | otherwise + = ((headerRows, r : bodyRows), False) + + isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue -rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks] -rowToBlocksList (Docx.Row cells) = do - blksList <- mapM cellToBlocks cells - return $ map singleParaToPlain blksList -- like trimInlines, but also take out linebreaks trimSps :: Inlines -> Inlines @@ -545,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c +bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) +bodyPartToTableCaption (TblCaption pPr parparts) = + Just <$> bodyPartToBlocks (Paragraph pPr parparts) +bodyPartToTableCaption _ = pure Nothing + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -636,54 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (TblCaption _ _) = + return $ para mempty -- collected separately bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do - let cap' = simpleCaption $ plain $ text cap - (hdr, rows) = case firstRowFormatting look of - True | null rs -> (Nothing, [r]) - | otherwise -> (Just r, rs) - False -> (Nothing, r:rs) - - cells <- mapM rowToBlocksList rows +bodyPartToBlocks (Tbl cap grid look parts) = do + captions <- gets docxTableCaptions + fullCaption <- case captions of + c : cs -> do + modify (\s -> s { docxTableCaptions = cs }) + return c + [] -> return $ if T.null cap then mempty else plain (text cap) + let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) + cap' = caption shortCaption fullCaption + (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts let width = maybe 0 maximum $ nonEmpty $ map rowLength parts - -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out - -- our own, see - -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155 - nonEmpty [] = Nothing - nonEmpty l = Just l rowLength :: Docx.Row -> Int - rowLength (Docx.Row c) = length c + rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c) - let toRow = Pandoc.Row nullAttr . 
map simpleCell - toHeaderRow l = [toRow l | not (null l)] + headerCells <- rowsToRows hdr + bodyCells <- rowsToRows rows - -- pad cells. New Text.Pandoc.Builder will do that for us, - -- so this is for compatibility while we switch over. - let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells - - hdrCells <- case hdr of - Just r' -> toHeaderRow <$> rowToBlocksList r' - Nothing -> return [] - - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. + -- Horizontal column alignment goes to the default at the moment. Getting + -- it might be difficult, since there doesn't seem to be a column entity + -- in docx. let alignments = replicate width AlignDefault - widths = replicate width ColWidthDefault + totalWidth = sum grid + widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) - (TableHead nullAttr hdrCells) - [TableBody nullAttr 0 [] cells'] + (TableHead nullAttr headerCells) + [TableBody nullAttr 0 [] bodyCells] (TableFoot nullAttr []) bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) - -- replace targets with generated anchors. rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do @@ -719,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps + captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps + modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index 46112af19..6e4faa639 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -2,7 +2,7 @@ {- | Module : Text.Pandoc.Readers.Docx.Combine Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, - 2014-2020 John MacFarlane <jgm@berkeley.edu>, + 2014-2021 John MacFarlane <jgm@berkeley.edu>, 2020 Nikolay Yakimov <root@livid.pp.ru> License : GNU GPL, version 2 or above @@ -61,7 +61,7 @@ import Data.List import Data.Bifunctor import Data.Sequence ( ViewL (..), ViewR (..), viewl, viewr, spanr, spanl , (><), (|>) ) -import Text.Pandoc.Builder +import Text.Pandoc.Builder as B data Modifier a = Modifier (a -> a) | AttrModifier (Attr -> a -> a) Attr @@ -116,12 +116,12 @@ ilModifierAndInnards ils = case viewl $ unMany ils of inlinesL :: Inlines -> (Inlines, Inlines) inlinesL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) + (s :< sq) -> (B.singleton s, Many sq) _ -> (mempty, ils) inlinesR :: Inlines -> (Inlines, Inlines) inlinesR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) + (sq :> s) -> (Many sq, B.singleton s) _ -> (ils, mempty) combineInlines :: Inlines -> Inlines -> Inlines @@ -182,7 +182,7 @@ isAttrModifier _ = False smushInlines :: [Inlines] -> Inlines smushInlines xs = combineInlines xs' mempty - where xs' = foldl combineInlines mempty xs + where xs' = foldl' combineInlines mempty xs smushBlocks :: [Blocks] -> 
Blocks -smushBlocks xs = foldl combineBlocks mempty xs +smushBlocks xs = foldl' combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index fdcffcc3f..dbb16a821 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParStyle , CharStyle(cStyleData) , Row(..) + , TblHeader(..) , Cell(..) + , VMerge(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , pHeading , constructBogusParStyleData , leftBiasedMergeRunStyle + , rowsToRowspans ) where import Text.Pandoc.Readers.Docx.Parse.Styles import Codec.Archive.Zip @@ -63,6 +66,7 @@ import Data.Char (chr, ord, readLitChar) import Data.List import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util @@ -72,8 +76,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) -import Text.XML.Light -import qualified Text.XML.Light.Cursor as XMLC +import Text.Pandoc.XML.Light data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -127,37 +130,23 @@ mapD f xs = in concatMapM handler xs -unwrap :: NameSpaces -> Content -> [Content] -unwrap ns (Elem element) +unwrapElement :: NameSpaces -> Element -> [Element] +unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap (unwrap ns . Elem) (elChildren sdtContent) + = concatMap (unwrapElement ns) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap (unwrap ns . 
Elem) (elChildren element) -unwrap _ content = [content] + = concatMap (unwrapElement ns) (elChildren element) + | otherwise + = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] -unwrapChild :: NameSpaces -> Content -> Content -unwrapChild ns (Elem element) = - Elem $ element { elContent = concatMap (unwrap ns) (elContent element) } -unwrapChild _ content = content +unwrapContent :: NameSpaces -> Content -> [Content] +unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element +unwrapContent _ content = [content] -walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor -walkDocument' ns cur = - let modifiedCur = XMLC.modifyContent (unwrapChild ns) cur - in - case XMLC.nextDF modifiedCur of - Just cur' -> walkDocument' ns cur' - Nothing -> XMLC.root modifiedCur - -walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument :: NameSpaces -> Element -> Element walkDocument ns element = - let cur = XMLC.fromContent (Elem element) - cur' = walkDocument' ns cur - in - case XMLC.toTree cur' of - Elem element' -> Just element' - _ -> Nothing - + element{ elContent = concatMap (unwrapContent ns) (elContent element) } newtype Docx = Docx Document deriving Show @@ -239,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] + | TblCaption ParagraphStyle [ParPart] | OMathPara [Exp] deriving Show @@ -250,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool} defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -newtype Row = Row [Cell] - deriving Show +data Row = Row TblHeader [Cell] deriving Show -newtype Cell = Cell [BodyPart] +data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq) + +data Cell = Cell GridSpan VMerge [BodyPart] deriving Show +type GridSpan = Integer + +data VMerge = Continue + -- ^ This cell should be merged with the one above it + | Restart + -- ^ This cell should not be merged with the one above it + deriving (Show, Eq) + +rowsToRowspans :: [Row] -> [[(Int, Cell)]] +rowsToRowspans rows = let + removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart)) + in removeMergedCells (foldr f [] rows) + where + f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]] + f (Row _ cells) acc = let + spans = g cells Nothing (listToMaybe acc) + in spans : acc + + g :: + -- | The current row + [Cell] -> + -- | Number of columns left below + Maybe Integer -> + -- | (rowspan so far, cell) for the row below this one + Maybe [(Int, Cell)] -> + -- | (rowspan so far, cell) for this row + [(Int, Cell)] + g cells _ Nothing = zip (repeat 1) cells + g cells columnsLeftBelow (Just rowBelow) = + case cells of + [] -> [] + thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of + [] -> zip (repeat 1) cells + (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ -> + let spanSoFar = case vmerge of + Restart -> 1 + Continue -> 1 + spanSoFarBelow + columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow) + (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow + in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow) + + dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)]) + dropColumns n [] = (n, []) + dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) = + if n < gridSpan + then (gridSpan - n, cells) + else dropColumns (n - gridSpan) otherCells + 
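A worked example (illustrative, not part of the patch) may help read the recursion above. Take a two-column table whose first column is merged vertically across two rows; a, b, c and d stand for the cells' [BodyPart] contents and every cell has gridSpan 1:

    rows = [ Row NoTblHeader [Cell 1 Restart a, Cell 1 Restart b]
           , Row NoTblHeader [Cell 1 Continue c, Cell 1 Restart d] ]

    -- rowsToRowspans rows
    --   == [ [ (2, Cell 1 Restart a), (1, Cell 1 Restart b) ]  -- a spans two rows
    --      , [ (1, Cell 1 Restart d) ] ]                       -- the Continue cell is dropped

rowsToRows in the Docx reader then wraps these Ints in RowSpan and builds the Pandoc cells, so cell a ends up spanning both rows of the generated table.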
leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle leftBiasedMergeRunStyle a b = RunStyle { isBold = isBold a <|> isBold b @@ -343,10 +382,16 @@ archiveToDocxWithWarnings archive = do Right doc -> Right (Docx doc, stateWarnings st) Left e -> Left e +parseXMLFromEntry :: Entry -> Maybe Element +parseXMLFromEntry entry = + case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of + Left _ -> Nothing + Right el -> Just el + getDocumentXmlPath :: Archive -> Maybe FilePath getDocumentXmlPath zf = do entry <- findEntryByPath "_rels/.rels" zf - relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + relsElem <- parseXMLFromEntry entry let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") @@ -354,18 +399,18 @@ getDocumentXmlPath zf = do fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to -- have trouble with. - return $ case fp of + return $ case T.unpack fp of '/' : fp' -> fp' - _ -> fp + fp' -> fp' archiveToDocument :: Archive -> D Document archiveToDocument zf = do docPath <- asks envDocXmlPath entry <- maybeToD $ findEntryByPath docPath zf - docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + let bodyElem' = walkDocument namespaces bodyElem body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -401,23 +446,24 @@ constructBogusParStyleData stName = ParStyle archiveToNotes :: Archive -> Notes archiveToNotes zf = let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + >>= parseXMLFromEntry enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - fn_namespaces = maybe [] elemToNameSpaces fnElem - en_namespaces = maybe [] elemToNameSpaces enElem - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= walkDocument ns >>= elemToNotes ns "footnote" - en = enElem >>= walkDocument ns >>= elemToNotes ns "endnote" + >>= parseXMLFromEntry + fn_namespaces = maybe mempty elemToNameSpaces fnElem + en_namespaces = maybe mempty elemToNameSpaces enElem + ns = M.union fn_namespaces en_namespaces + fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns + en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in Notes ns fn en archiveToComments :: Archive -> Comments archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - cmts_namespaces = maybe [] elemToNameSpaces cmtsElem - cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) + >>= parseXMLFromEntry + cmts_namespaces = maybe mempty elemToNameSpaces cmtsElem + cmts = elemToComments cmts_namespaces . 
walkDocument cmts_namespaces <$> + cmtsElem in case cmts of Just c -> Comments cmts_namespaces c @@ -433,20 +479,26 @@ filePathToRelType path docXmlPath = then Just InDocument else Nothing -relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship -relElemToRelationship relType element | qName (elName element) == "Relationship" = +relElemToRelationship :: FilePath -> DocumentLocation -> Element + -> Maybe Relationship +relElemToRelationship fp relType element | qName (elName element) == "Relationship" = do - relId <- findAttrText (QName "Id" Nothing Nothing) element - target <- findAttrText (QName "Target" Nothing Nothing) element - return $ Relationship relType relId target -relElemToRelationship _ _ = Nothing + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + -- target may be relative (media/image1.jpeg) or absolute + -- (/word/media/image1.jpeg); we need to relativize it (see #7374) + let frontOfFp = T.pack $ takeWhile (/= '_') fp + let target' = fromMaybe target $ + T.stripPrefix frontOfFp $ T.dropWhile (== '/') target + return $ Relationship relType relId target' +relElemToRelationship _ _ _ = Nothing filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship] filePathToRelationships ar docXmlPath fp | Just relType <- filePathToRelType fp docXmlPath , Just entry <- findEntryByPath fp ar - , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = - mapMaybe (relElemToRelationship relType) $ elChildren relElems + , Just relElems <- parseXMLFromEntry entry = + mapMaybe (relElemToRelationship fp relType) $ elChildren relElems filePathToRelationships _ _ _ = [] archiveToRelationships :: Archive -> FilePath -> [Relationship] @@ -478,10 +530,10 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do loElemToLevelOverride :: NameSpaces -> Element -> Maybe LevelOverride loElemToLevelOverride ns element | isElem ns "w" "lvlOverride" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element let startOverride = findChildByName ns "w" "startOverride" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger lvl = findChildByName ns "w" "lvl" element >>= levelElemToLevel ns return $ LevelOverride ilvl startOverride lvl @@ -490,9 +542,9 @@ loElemToLevelOverride _ _ = Nothing numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttrTextByName ns "w" "numId" element + numId <- findAttrByName ns "w" "numId" element absNumId <- findChildByName ns "w" "abstractNumId" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let lvlOverrides = mapMaybe (loElemToLevelOverride ns) (findChildrenByName ns "w" "lvlOverride" element) @@ -502,7 +554,7 @@ numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttrTextByName ns "w" "abstractNumId" element + absNumId <- findAttrByName ns "w" "abstractNumId" element let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels @@ -511,23 +563,23 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- 
findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element fmt <- findChildByName ns "w" "numFmt" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" txt <- findChildByName ns "w" "lvlText" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let start = findChildByName ns "w" "start" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger return (Level ilvl fmt txt start) levelElemToLevel _ _ = Nothing archiveToNumbering' :: Archive -> Maybe Numbering archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of - Nothing -> Just $ Numbering [] [] [] + Nothing -> Just $ Numbering mempty [] [] Just entry -> do - numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + numberingElem <- parseXMLFromEntry entry let namespaces = elemToNameSpaces numberingElem numElems = findChildrenByName namespaces "w" "num" numberingElem absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem @@ -537,13 +589,13 @@ archiveToNumbering' zf = archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = - fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) + fromMaybe (Numbering mempty [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map T.Text Element) +elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element | isElem ns "w" (notetype <> "s") element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" notetype element) in @@ -555,7 +607,7 @@ elemToComments :: NameSpaces -> Element -> M.Map T.Text Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" "comment" element) in @@ -570,7 +622,7 @@ elemToTblGrid :: NameSpaces -> Element -> D TblGrid elemToTblGrid ns element | isElem ns "w" "tblGrid" element = let cols = findChildrenByName ns "w" "gridCol" element in - mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) + mapD (\e -> maybeToD (findAttrByName ns "w" "w" e >>= stringToInteger)) cols elemToTblGrid _ _ = throwError WrongElem @@ -594,14 +646,31 @@ elemToRow ns element | isElem ns "w" "tr" element = do let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems - return $ Row cells + let hasTblHeader = maybe NoTblHeader (const HasTblHeader) + (findChildByName ns "w" "trPr" element + >>= findChildByName ns "w" "tblHeader") + return $ Row hasTblHeader cells elemToRow _ _ = throwError WrongElem elemToCell :: NameSpaces -> Element -> D Cell elemToCell ns element | isElem ns "w" "tc" element = do + let properties = findChildByName ns "w" "tcPr" element + let gridSpan = properties + >>= findChildByName ns "w" "gridSpan" + >>= findAttrByName ns "w" "val" + >>= stringToInteger + let vMerge = case properties >>= findChildByName ns "w" "vMerge" of + Nothing -> Restart + Just e -> + fromMaybe Continue $ do + s <- findAttrByName ns "w" "val" e + case s of + "continue" -> Just Continue + "restart" -> Just Restart + _ -> Nothing cellContents <- mapD (elemToBodyPart ns) (elChildren element) - return $ Cell cellContents + return $ Cell (fromMaybe 1 gridSpan) vMerge 
cellContents elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation @@ -615,12 +684,12 @@ elemToParIndentation ns element | isElem ns "w" "ind" element = stringToInteger , hangingParIndent = findAttrByName ns "w" "hanging" element >>= - stringToInteger} + stringToInteger } elemToParIndentation _ _ = Nothing -testBitMask :: String -> Int -> Bool +testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> (n' .|. n) /= 0 @@ -633,10 +702,9 @@ pNumInfo = getParStyleField numInfo . pStyle elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildrenByName ns "m" "oMathPara" element = - do - expsLst <- eitherToD $ readOMML $ T.pack $ showElement c - return $ OMathPara expsLst + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do @@ -654,13 +722,31 @@ elemToBodyPart ns element Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts - _ -> return $ Paragraph parstyle parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts + elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChildByName ns "w" "tblPr" element + let tblProperties = findChildByName ns "w" "tblPr" element + caption = fromMaybe "" $ tblProperties >>= findChildByName ns "w" "tblCaption" - >>= findAttrTextByName ns "w" "val" - caption = fromMaybe "" caption' + >>= findAttrByName ns "w" "val" + description = fromMaybe "" $ tblProperties + >>= findChildByName ns "w" "tblDescription" + >>= findAttrByName ns "w" "val" grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] @@ -673,7 +759,7 @@ elemToBodyPart ns element grid <- grid' tblLook <- tblLook' rows <- mapD (elemToRow ns) (elChildren element) - return $ Tbl caption grid tblLook rows + return $ Tbl (caption <> description) grid tblLook rows elemToBodyPart _ _ = throwError WrongElem lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target @@ -698,8 +784,8 @@ getTitleAndAlt :: NameSpaces -> Element -> (T.Text, T.Text) getTitleAndAlt ns element = let mbDocPr = findChildByName ns "wp" "inline" element >>= findChildByName ns "wp" "docPr" - title = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "title") - alt = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "descr") + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart @@ 
-711,22 +797,29 @@ elemToParPart ns element = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrTextByName ns "r" "embed" + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) Nothing -> throwError WrongElem --- The below is an attempt to deal with images in deprecated vml format. +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrTextByName ns "r" "id" + >>= findAttrByName ns "r" "id" in case drawing of - -- Todo: check out title and attr for deprecated format. Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) -- Chart elemToParPart ns element | isElem ns "w" "r" element @@ -790,7 +883,7 @@ elemToParPart ns element fldCharState <- gets stateFldCharState case fldCharState of FldCharOpen -> do - info <- eitherToD $ parseFieldInfo $ T.pack $ strContent instrText + info <- eitherToD $ parseFieldInfo $ strContent instrText modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} return NullParPart _ -> return NullParPart @@ -811,48 +904,48 @@ elemToParPart ns element return $ ChangedRuns change runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttrTextByName ns "w" "id" element - , Just bmName <- findAttrTextByName ns "w" "name" element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttrTextByName ns "r" "id" element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> - case findAttrTextByName ns "w" "anchor" element of + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttrTextByName ns "w" "anchor" element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem 
Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - fmap PlainOMath (eitherToD $ readOMML $ T.pack $ showElement element) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttrTextByName ns "w" "id" element - , Just cmtAuthor <- findAttrTextByName ns "w" "author" element - , cmtDate <- findAttrTextByName ns "w" "date" element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem @@ -871,7 +964,7 @@ elemToExtent drawingElem = where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem - >>= findAttr (QName at Nothing Nothing) >>= safeRead . T.pack + >>= findAttr (QName at Nothing Nothing) >>= safeRead childElemToRun :: NameSpaces -> Element -> D Run @@ -882,7 +975,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrText (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (M.lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= @@ -895,7 +988,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttrTextByName ns "w" "id" element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -903,7 +996,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttrTextByName ns "w" "id" element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -956,15 +1049,15 @@ getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Insertion (ChangeInfo cId cAuthor mcDate) getTrackedChange ns element | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- 
findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Deletion (ChangeInfo cId cAuthor mcDate) getTrackedChange _ _ = Nothing @@ -973,7 +1066,7 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (fmap ParaStyleId . findAttrTextByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style @@ -1005,7 +1098,7 @@ elemToRunStyleD ns element charStyles <- asks envCharStyles let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrTextByName ns "w" "val" >>= + findAttrByName ns "w" "val" >>= flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle @@ -1015,7 +1108,7 @@ elemToRunElem ns element | isElem ns "w" "t" element || isElem ns "w" "delText" element || isElem ns "m" "t" element = do - let str = T.pack $ strContent element + let str = strContent element font <- asks envFont case font of Nothing -> return $ TextRun str @@ -1037,14 +1130,14 @@ getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = - case readLitChar ("\\x" ++ s) of + case readLitChar ("\\x" ++ T.unpack s) of [(char, _)] -> TextRun . maybe "" T.singleton $ getUnicode font char _ -> TextRun "" where getCodepoint = findAttrByName ns "w" "char" element - getFont = textToFont . T.pack =<< findAttrByName ns "w" "font" element - lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + getFont = textToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate t | "F" `T.isPrefixOf` t = "0" <> T.drop 1 t + | otherwise = t getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1054,8 +1147,9 @@ elemToRunElems ns element let qualName = elemName ns "w" let font = do fontElem <- findElement (qualName "rFonts") element - textToFont . T.pack =<< - foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] + textToFont =<< + foldr ((<|>) . (flip findAttr fontElem . 
qualName)) + Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index 236167187..0d7271d6a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -48,11 +48,13 @@ import Data.Function (on) import Data.String (IsString(..)) import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.Text.Read +import Data.Text (Text) import Data.Maybe import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light +import Text.Pandoc.XML.Light newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -108,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , isRTL :: Maybe Bool , isForceCTL :: Maybe Bool , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String + , rUnderline :: Maybe Text , rParentStyle :: Maybe CharStyle } deriving Show @@ -135,19 +137,22 @@ defaultRunStyle = RunStyle { isBold = Nothing , rParentStyle = Nothing } -archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => - (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' + :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) + => (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) archiveToStyles' conv1 conv2 zf = - let stylesElem = findEntryByPath "word/styles.xml" zf >>= - (parseXMLDoc . UTF8.toStringLazy . fromEntry) - in - case stylesElem of - Nothing -> (M.empty, M.empty) - Just styElem -> - let namespaces = elemToNameSpaces styElem - in - ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, - M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) + case findEntryByPath "word/styles.xml" zf of + Nothing -> (M.empty, M.empty) + Just entry -> + case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of + Left _ -> (M.empty, M.empty) + Right styElem -> + let namespaces = elemToNameSpaces styElem + in + ( M.fromList $ map (\r -> (conv1 r, r)) $ + buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ + buildBasedOnList namespaces styElem Nothing) isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle @@ -155,7 +160,7 @@ isBasedOnStyle ns element parentStyle , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element @@ -165,7 +170,7 @@ isBasedOnStyle ns element parentStyle | otherwise = False class HasStyleId a => ElemToStyle a where - cStyleType :: Maybe a -> String + cStyleType :: Maybe a -> Text elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a class FromStyleId (StyleId a) => HasStyleId a where @@ -222,8 +227,10 @@ buildBasedOnList ns element rootStyle = stys -> stys ++ concatMap (buildBasedOnList ns element . 
Just) stys -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +stringToInteger :: Text -> Maybe Integer +stringToInteger s = case Data.Text.Read.decimal s of + Right (x,_) -> Just x + Left _ -> Nothing checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -243,7 +250,7 @@ checkOnOff _ _ _ = Nothing elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle elemToCharStyle ns element parentStyle - = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element) + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) <*> getElementStyleName ns element <*> Just (elemToRunStyle ns element parentStyle) @@ -277,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element | Just styleName <- getElementStyleName ns element - , Just n <- stringToInteger . T.unpack =<< + , Just n <- stringToInteger =<< (T.stripPrefix "heading " . T.toLower $ fromStyleName styleName) , n > 0 = Just (styleName, fromInteger n) @@ -285,8 +292,8 @@ getHeaderLevel _ _ = Nothing getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> - ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val") - <|> findAttrTextByName ns "w" "styleId" el) + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text) getNumInfo ns element = do @@ -294,15 +301,15 @@ getNumInfo ns element = do findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= findChildByName ns "w" "ilvl" >>= - findAttrTextByName ns "w" "val") + findAttrByName ns "w" "val") numId <- numPr >>= findChildByName ns "w" "numId" >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" return (numId, lvl) elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle elemToParStyleData ns element parentStyle - | Just styleId <- findAttrTextByName ns "w" "styleId" element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just styleName <- getElementStyleName ns element = Just $ ParStyle { diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index a573344ff..970697a2d 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,7 +1,8 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, - 2014-2020 John MacFarlane <jgm@berkeley.edu>, + 2014-2021 John MacFarlane <jgm@berkeley.edu>, 2015 Nikolay Yakimov <root@livid.pp.ru> License : GNU GPL, version 2 or above @@ -18,51 +19,52 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName - , findAttrText + , findElementByName , findAttrByName - , findAttrTextByName ) where -import Data.Maybe (mapMaybe) import qualified Data.Text as T -import Text.XML.Light +import Data.Text (Text) +import Text.Pandoc.XML.Light +import qualified Data.Map as M -type NameSpaces = [(String, String)] +type NameSpaces = M.Map Text Text elemToNameSpaces :: Element -> NameSpaces -elemToNameSpaces = mapMaybe attrToNSPair . elAttribs +elemToNameSpaces = foldr (\(Attr qn val) -> + case qn of + QName s _ (Just "xmlns") -> M.insert s val + _ -> id) mempty . 
elAttribs -attrToNSPair :: Attr -> Maybe (String, String) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing - -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (M.lookup prefix ns) + (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = - let ns' = ns ++ elemToNameSpaces element + let ns' = ns <> elemToNameSpaces element in qName (elName element) == name && - qURI (elName element) == lookup prefix ns' + qURI (elName element) == M.lookup prefix ns' -findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChild (elemName ns' pref name) el -findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText x = fmap T.pack . findAttr x +-- | Like 'findChildrenByName', but searches descendants. +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el -findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findAttr (elemName ns' pref name) el -findAttrTextByName :: NameSpaces -> String -> String -> Element -> Maybe T.Text -findAttrTextByName a b c = fmap T.pack . findAttrByName a b c diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs index 336be09e5..db98ac8de 100644 --- a/src/Text/Pandoc/Readers/DokuWiki.hs +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -29,26 +29,27 @@ import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) -import Text.Pandoc.Shared (crFilter, trim, stringify, tshow) +import Text.Pandoc.Shared (trim, stringify, tshow) -- | Read DokuWiki from an input string and return a Pandoc document. -readDokuWiki :: PandocMonad m +readDokuWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readDokuWiki opts s = do - let input = crFilter s - res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input + let sources = toSources s + res <- runParserT parseDokuWiki def {stateOptions = opts } + (initialSourceName sources) sources case res of - Left e -> throwError $ PandocParsecError input e + Left e -> throwError $ PandocParsecError sources e Right d -> return d -type DWParser = ParserT Text ParserState +type DWParser = ParserT Sources ParserState -- * Utility functions -- | Parse end-of-line, which can be either a newline or end-of-file. 
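-- A minimal sketch, not taken from this patch, of the Sources-based reader
-- entry point that readDokuWiki above now follows: any ToSources input is
-- normalized with toSources, the parser runs over the Sources stream so that
-- error positions refer to the original inputs, and failures are rethrown as
-- PandocParsecError. readSomeFormat and parseBody are placeholder names; the
-- usual pandoc imports (Text.Pandoc.Parsing, Text.Pandoc.Sources,
-- Control.Monad.Except, Data.Default) are assumed.
readSomeFormat :: (PandocMonad m, ToSources a)
               => ReaderOptions -> a -> m Pandoc
readSomeFormat opts s = do
  let sources = toSources s
  res <- runParserT parseBody def{ stateOptions = opts }
                    (initialSourceName sources) sources
  case res of
    Left e  -> throwError $ PandocParsecError sources e
    Right d -> return d
-- parseBody :: ParserT Sources ParserState m Pandoc  -- stands in for the
-- real grammar of whatever format is being read.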
-eol :: Stream s m Char => ParserT s st m () +eol :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () eol = void newline <|> eof nested :: PandocMonad m => DWParser m a -> DWParser m a @@ -317,7 +318,7 @@ interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" <> page interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" <> page interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" <> page interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" <> page -interwikiToUrl _ page = "https://www.google.com/search?q=" <> page <> "&btnI=lucky" +interwikiToUrl unknown page = unknown <> ">" <> page linkText :: PandocMonad m => DWParser m B.Inlines linkText = parseLink fromRaw "[[" "]]" diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 5e3326e6d..eb8d2405d 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -17,14 +17,14 @@ module Text.Pandoc.Readers.EPUB (readEPUB) where -import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry, +import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry, toArchiveOrFail) import Control.DeepSeq (NFData, deepseq) import Control.Monad (guard, liftM, liftM2, mplus) import Control.Monad.Except (throwError) import qualified Data.ByteString.Lazy as BL (ByteString) -import Data.List (isInfixOf) import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Map as M (Map, elems, fromList, lookup) import Data.Maybe (mapMaybe) import qualified Data.Text.Lazy as TL @@ -40,12 +40,12 @@ import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Options (ReaderOptions (..)) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) -import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI, tshow) +import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy) import Text.Pandoc.Walk (query, walk) -import Text.XML.Light +import Text.Pandoc.XML.Light -type Items = M.Map String (FilePath, MimeType) +type Items = M.Map Text (FilePath, MimeType) readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of @@ -125,26 +125,27 @@ imageToPandoc s = B.doc . 
B.para $ B.image (T.pack s) "" mempty imageMimes :: [MimeType] imageMimes = ["image/gif", "image/jpeg", "image/png"] -type CoverId = String +type CoverId = Text type CoverImage = FilePath -parseManifest :: (PandocMonad m) => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) + => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) parseManifest content coverId = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover `mplus` coverId, M.fromList r) + return (T.unpack <$> (cover `mplus` coverId), M.fromList r) where - findCover e = maybe False (isInfixOf "cover-image") + findCover e = maybe False (T.isInfixOf "cover-image") (findAttr (emptyName "properties") e) || Just True == liftM2 (==) coverId (findAttr (emptyName "id") e) parseItem e = do uid <- findAttrE (emptyName "id") e href <- findAttrE (emptyName "href") e mime <- findAttrE (emptyName "media-type") e - return (uid, (href, T.pack mime)) + return (uid, (T.unpack href, mime)) parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do @@ -172,25 +173,25 @@ parseMeta content = do -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem parseMetaItem :: Element -> Meta -> Meta parseMetaItem e@(stripNamespace . elName -> field) meta = - addMetaField (renameMeta field) (B.str $ T.pack $ strContent e) meta + addMetaField (renameMeta field) (B.str $ strContent e) meta -renameMeta :: String -> T.Text +renameMeta :: Text -> Text renameMeta "creator" = "author" -renameMeta s = T.pack s +renameMeta s = s getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive - docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + docElem <- parseXMLDocE metaEntry let namespaces = mapMaybe attrToNSPair (elAttribs docElem) ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) - manifestFile <- mkE "Root not found" (lookup "full-path" as) + manifestFile <- T.unpack <$> mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive - (rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + (rootdir,) <$> parseXMLDocE manifest -- Fixup @@ -200,7 +201,8 @@ fixInternalReferences pathToFile = . walk (fixBlockIRs filename) . walk (fixInlineIRs filename) where - (root, T.unpack . escapeURI . T.pack -> filename) = splitFileName pathToFile + (root, T.unpack . escapeURI . T.pack -> filename) = + splitFileName pathToFile fixInlineIRs :: String -> Inline -> Inline fixInlineIRs s (Span as v) = @@ -213,7 +215,7 @@ fixInlineIRs s (Link as is t) = Link (fixAttrs s as) is t fixInlineIRs _ v = v -prependHash :: [T.Text] -> Inline -> Inline +prependHash :: [Text] -> Inline -> Inline prependHash ps l@(Link attr is (url, tit)) | or [s `T.isPrefixOf` url | s <- ps] = Link attr is ("#" <> url, tit) @@ -230,16 +232,17 @@ fixBlockIRs s (CodeBlock as code) = fixBlockIRs _ b = b fixAttrs :: FilePath -> B.Attr -> B.Attr -fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) +fixAttrs s (ident, cs, kvs) = + (addHash s ident, filter (not . 
T.null) cs, removeEPUBAttrs kvs) -addHash :: String -> T.Text -> T.Text +addHash :: FilePath -> Text -> Text addHash _ "" = "" addHash s ident = T.pack (takeFileName s) <> "#" <> ident -removeEPUBAttrs :: [(T.Text, T.Text)] -> [(T.Text, T.Text)] +removeEPUBAttrs :: [(Text, Text)] -> [(Text, Text)] removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs -isEPUBAttr :: (T.Text, a) -> Bool +isEPUBAttr :: (Text, a) -> Bool isEPUBAttr (k, _) = "epub:" `T.isPrefixOf` k -- Library @@ -256,39 +259,44 @@ uncurry3 f (a, b, c) = f a b c -- Utility -stripNamespace :: QName -> String +stripNamespace :: QName -> Text stripNamespace (QName v _ _) = v -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) attrToNSPair _ = Nothing -attrToPair :: Attr -> (String, String) +attrToPair :: Attr -> (Text, Text) attrToPair (Attr (QName name _ _) val) = (name, val) -defaultNameSpace :: Maybe String +defaultNameSpace :: Maybe Text defaultNameSpace = Just "http://www.idpf.org/2007/opf" -dfName :: String -> QName +dfName :: Text -> QName dfName s = QName s defaultNameSpace Nothing -emptyName :: String -> QName +emptyName :: Text -> QName emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: PandocMonad m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m Text findAttrE q e = mkE "findAttr" $ findAttr q e findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise . unEscapeString -> path) a = - mkE ("No entry on path: " ++ path) $ findEntryByPath path a + mkE ("No entry on path: " <> T.pack path) $ findEntryByPath path a -parseXMLDocE :: PandocMonad m => String -> m Element -parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc +parseXMLDocE :: PandocMonad m => Entry -> m Element +parseXMLDocE entry = + either (throwError . PandocXMLError fp) return $ parseXMLElement doc + where + doc = UTF8.toTextLazy . fromEntry $ entry + fp = T.pack $ eRelativePath entry findElementE :: PandocMonad m => QName -> Element -> m Element -findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x +findElementE e x = + mkE ("Unable to find element: " <> tshow e) $ findElement e x -mkE :: PandocMonad m => String -> Maybe a -> m a -mkE s = maybe (throwError . PandocParseError $ T.pack s) return +mkE :: PandocMonad m => Text -> Maybe a -> m a +mkE s = maybe (throwError . 
PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index b0d2f092b..84e5278db 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -25,13 +25,13 @@ TODO: module Text.Pandoc.Readers.FB2 ( readFB2 ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict -import Data.ByteString.Lazy.Char8 ( pack ) import Data.ByteString.Base64.Lazy import Data.Functor import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Data.Default import Data.Maybe import Text.HTML.TagSoup.Entity (lookupEntity) @@ -40,8 +40,9 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad, insertMedia, report) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter) -import Text.XML.Light +import Text.Pandoc.XML.Light +import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Sources (ToSources(..), sourcesToText) type FB2 m = StateT FB2State m @@ -62,12 +63,15 @@ instance HasMeta FB2State where setMeta field v s = s {fb2Meta = setMeta field v (fb2Meta s)} deleteMeta field s = s {fb2Meta = deleteMeta field (fb2Meta s)} -readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readFB2 :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readFB2 _ inp = - case parseXMLDoc $ crFilter inp of - Nothing -> throwError $ PandocParseError "Not an XML document" - Just e -> do - (bs, st) <- runStateT (parseRootElement e) def + case parseXMLElement $ TL.fromStrict $ sourcesToText $ toSources inp of + Left msg -> throwError $ PandocXMLError "" msg + Right el -> do + (bs, st) <- runStateT (parseRootElement el) def let authors = if null $ fb2Authors st then id else setMeta "author" (map text $ reverse $ fb2Authors st) @@ -83,12 +87,12 @@ removeHash t = case T.uncons t of Just ('#', xs) -> xs _ -> t -convertEntity :: String -> Text -convertEntity e = maybe (T.toUpper $ T.pack e) T.pack $ lookupEntity e +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack $ lookupEntity (T.unpack e) parseInline :: PandocMonad m => Content -> FB2 m Inlines parseInline (Elem e) = - case T.pack $ qName $ elName e of + case qName $ elName e of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -96,12 +100,12 @@ parseInline (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement name pure mempty -parseInline (Text x) = pure $ text $ T.pack $ cdData x +parseInline (Text x) = pure $ text $ cdData x parseInline (CRef r) = pure $ str $ convertEntity r parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks @@ -111,7 +115,7 @@ parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel < parseRootElement :: PandocMonad m => Element -> FB2 m Blocks parseRootElement e = - case T.pack $ qName $ elName e of + case qName $ elName e of "FictionBook" -> do -- Parse notes before parsing the rest of the content. 
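-- A small sketch, not part of this patch, of the XML entry-point pattern the
-- readFB2 hunk above adopts: parseXMLElement (from Text.Pandoc.XML.Light)
-- takes lazy Text and returns Either Text Element, and a parse failure is
-- rethrown as PandocXMLError together with a label for the input. The name
-- parseXMLOrThrow and the label argument are illustrative only.
parseXMLOrThrow :: PandocMonad m => Text -> TL.Text -> m Element
parseXMLOrThrow label txt =
  case parseXMLElement txt of
    Left msg -> throwError $ PandocXMLError label msg
    Right el -> return el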
case filterChild isNotesBody e of @@ -144,7 +148,7 @@ parseNote e = Just sectionId -> do content <- mconcat <$> mapM parseSectionChild (dropTitle $ elChildren e) oldNotes <- gets fb2Notes - modify $ \s -> s { fb2Notes = M.insert ("#" <> T.pack sectionId) content oldNotes } + modify $ \s -> s { fb2Notes = M.insert ("#" <> sectionId) content oldNotes } pure () where isTitle x = qName (elName x) == "title" @@ -156,7 +160,7 @@ parseNote e = -- | Parse a child of @\<FictionBook>@ element. parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks parseFictionBookChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "stylesheet" -> pure mempty -- stylesheet is ignored "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e) "body" -> if isNotesBody e @@ -168,7 +172,7 @@ parseFictionBookChild e = -- | Parse a child of @\<description>@ element. parseDescriptionChild :: PandocMonad m => Element -> FB2 m () parseDescriptionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title-info" -> mapM_ parseTitleInfoChild (elChildren e) "src-title-info" -> pure () -- ignore "document-info" -> pure () @@ -182,7 +186,7 @@ parseDescriptionChild e = -- | Parse a child of @\<body>@ element. parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks parseBodyChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "image" -> parseImageElement e "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e) "epigraph" -> parseEpigraph e @@ -196,7 +200,10 @@ parseBinaryElement e = (Nothing, _) -> report $ IgnoredElement "binary without id attribute" (Just _, Nothing) -> report $ IgnoredElement "binary without content-type attribute" - (Just filename, contentType) -> insertMedia filename (T.pack <$> contentType) (decodeLenient (pack (strContent e))) + (Just filename, contentType) -> + insertMedia (T.unpack filename) contentType + (decodeLenient + (UTF8.fromTextLazy . TL.fromStrict . strContent $ e)) -- * Type parsers @@ -206,13 +213,13 @@ parseAuthor e = T.unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe Text) parseAuthorChild e = - case T.pack $ qName $ elName e of - "first-name" -> pure $ Just $ T.pack $ strContent e - "middle-name" -> pure $ Just $ T.pack $ strContent e - "last-name" -> pure $ Just $ T.pack $ strContent e - "nickname" -> pure $ Just $ T.pack $ strContent e - "home-page" -> pure $ Just $ T.pack $ strContent e - "email" -> pure $ Just $ T.pack $ strContent e + case qName $ elName e of + "first-name" -> pure $ Just $ strContent e + "middle-name" -> pure $ Just $ strContent e + "last-name" -> pure $ Just $ strContent e + "nickname" -> pure $ Just $ strContent e + "home-page" -> pure $ Just $ strContent e + "email" -> pure $ Just $ strContent e name -> do report $ IgnoredElement $ name <> " in author" pure Nothing @@ -236,13 +243,13 @@ parseTitleContent _ = pure Nothing parseImageElement :: PandocMonad m => Element -> FB2 m Blocks parseImageElement e = case href of - Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash $ T.pack src) title alt + Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt Nothing -> do report $ IgnoredElement " image without href" pure mempty - where alt = maybe mempty (str . 
T.pack) $ findAttr (unqual "alt") e - title = maybe "" T.pack $ findAttr (unqual "title") e - imgId = maybe "" T.pack $ findAttr (unqual "id") e + where alt = maybe mempty str $ findAttr (unqual "alt") e + title = fromMaybe "" $ findAttr (unqual "title") e + imgId = fromMaybe "" $ findAttr (unqual "id") e href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e -- | Parse @pType@ @@ -256,7 +263,7 @@ parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e) -- | Parse @citeType@ child parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks parseCiteChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "empty-line" -> pure horizontalRule @@ -271,13 +278,13 @@ parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e) parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks parsePoemChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "epigraph" -> parseEpigraph e "stanza" -> parseStanza e "text-author" -> para <$> parsePType e - "date" -> pure $ para $ text $ T.pack $ strContent e + "date" -> pure $ para $ text $ strContent e name -> report (UnexpectedXmlElement name "poem") $> mempty parseStanza :: PandocMonad m => Element -> FB2 m Blocks @@ -290,7 +297,7 @@ joinLineBlocks [] = [] parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks parseStanzaChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "v" -> lineBlock . (:[]) <$> parsePType e @@ -300,11 +307,11 @@ parseStanzaChild e = parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks parseEpigraph e = divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e) - where divId = maybe "" T.pack $ findAttr (unqual "id") e + where divId = fromMaybe "" $ findAttr (unqual "id") e parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks parseEpigraphChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -318,7 +325,7 @@ parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e) parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks parseAnnotationChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -332,14 +339,14 @@ parseSection :: PandocMonad m => Element -> FB2 m Blocks parseSection e = do n <- gets fb2SectionLevel modify $ \st -> st{ fb2SectionLevel = n + 1 } - let sectionId = maybe "" T.pack $ findAttr (unqual "id") e + let sectionId = fromMaybe "" $ findAttr (unqual "id") e bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e) modify $ \st -> st{ fb2SectionLevel = n } pure bs parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks parseSectionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseBodyChild e "epigraph" -> parseEpigraph e "image" -> parseImageElement e @@ -361,16 +368,16 @@ parseStyleType e = mconcat <$> mapM parseInline (elContent e) parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines parseNamedStyle e = do content <- mconcat <$> mapM parseNamedStyleChild (elContent e) - let lang = maybeToList $ ("lang",) . 
T.pack <$> findAttr (QName "lang" Nothing (Just "xml")) e + let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e case findAttr (unqual "name") e of - Just name -> pure $ spanWith ("", [T.pack name], lang) content + Just name -> pure $ spanWith ("", [name], lang) content Nothing -> do report $ IgnoredElement "link without required name" pure mempty parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines parseNamedStyleChild (Elem e) = - case T.pack $ qName (elName e) of + case qName (elName e) of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -378,7 +385,7 @@ parseNamedStyleChild (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement $ name <> " in style" @@ -390,7 +397,7 @@ parseLinkType :: PandocMonad m => Element -> FB2 m Inlines parseLinkType e = do content <- mconcat <$> mapM parseStyleLinkType (elContent e) notes <- gets fb2Notes - case T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just href -> case findAttr (QName "type" Nothing Nothing) e of Just "note" -> case M.lookup href notes of Nothing -> pure $ link href "" content @@ -417,15 +424,14 @@ parseTable _ = pure mempty -- TODO: tables are not supported yet -- | Parse @title-infoType@ parseTitleInfoChild :: PandocMonad m => Element -> FB2 m () parseTitleInfoChild e = - case T.pack $ qName (elName e) of + case qName (elName e) of "genre" -> pure () "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st}) - "book-title" -> modify (setMeta "title" (text $ T.pack $ strContent e)) + "book-title" -> modify (setMeta "title" (text $ strContent e)) "annotation" -> parseAnnotation e >>= modify . setMeta "abstract" "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ T.splitOn "," - $ T.pack $ strContent e)) - "date" -> modify (setMeta "date" (text $ T.pack $ strContent e)) + "date" -> modify (setMeta "date" (text $ strContent e)) "coverpage" -> parseCoverPage e "lang" -> pure () "src-lang" -> pure () @@ -439,7 +445,7 @@ parseCoverPage e = Just img -> case href of Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src)) Nothing -> pure () - where href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img + where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img Nothing -> pure () -- | Parse @inlineImageType@ element @@ -452,5 +458,5 @@ parseInlineImageElement e = Nothing -> do report $ IgnoredElement "inline image without href" pure mempty - where alt = maybe mempty (str . 
T.pack) $ findAttr (unqual "alt") e - href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e + where alt = maybe mempty str $ findAttr (unqual "alt") e + href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..fdf4f28e0 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -5,7 +5,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -19,21 +19,20 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag - , NamedTag(..) , isTextTag , isCommentTag ) where import Control.Applicative ((<|>)) -import Control.Arrow (first) import Control.Monad (guard, msum, mzero, unless, void) -import Control.Monad.Except (throwError) +import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader (ask, asks, lift, local, runReaderT) import Data.ByteString.Base64 (encode) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List.Split (splitWhen) +import Data.List (foldl') import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) @@ -62,21 +61,22 @@ import Text.Pandoc.Options ( ReaderOptions (readerExtensions, readerStripComments), extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) -import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, - extractSpaces, htmlSpanLikeElements, safeRead, tshow) +import Text.Pandoc.Shared ( + addMetaField, blocksToInlines', escapeURI, extractSpaces, + htmlSpanLikeElements, renderTags', safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: PandocMonad m +readHtml :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> a -- ^ Input to parse -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } - (crFilter inp) + (sourcesToText $ toSources inp) parseDoc = do blocks <- fixPlains False . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -112,14 +121,18 @@ setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks -pHtml = try $ do +pHtml = do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . 
B.setMeta "lang" . B.text pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks -pBody = pInTags "body" block +pBody = do + (TagOpen "body" attr) <- lookAhead pAny + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ + updateState . B.setMeta "lang" . B.text + pInTags "body" block pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) @@ -145,32 +158,65 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) return mempty block :: PandocMonad m => TagParser m Blocks -block = do - res <- choice - [ eSection - , eSwitch B.para block - , mempty <$ eFootnote - , mempty <$ eTOC - , mempty <$ eTitlePage - , pPara - , pHeader - , pBlockQuote - , pCodeBlock - , pList - , pHrule - , pTable block - , pHtml - , pHead - , pBody - , pLineBlock - , pDiv - , pPlain - , pFigure - , pIframe - , pRawHtmlBlock - ] - trace (T.take 60 $ tshow $ B.toList res) - return res +block = ((do + tag <- lookAhead (pSatisfy isBlockTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + let type' = fromMaybe "" $ + lookup "type" attr <|> lookup "epub:type" attr + epubExts = extensionEnabled Ext_epub_html_exts exts + in + case name of + _ | name `elem` sectioningContent + , epubExts + , "chapter" `T.isInfixOf` type' + -> eSection + _ | epubExts + , type' `elem` ["footnote", "rearnote"] + -> mempty <$ eFootnote + _ | epubExts + , type' == "toc" + -> mempty <$ eTOC + _ | "titlepage" `T.isInfixOf` type' + , name `elem` ("section" : groupingContent) + -> mempty <$ eTitlePage + "p" -> pPara + "h1" -> pHeader + "h2" -> pHeader + "h3" -> pHeader + "h4" -> pHeader + "h5" -> pHeader + "h6" -> pHeader + "blockquote" -> pBlockQuote + "pre" -> pCodeBlock + "ul" -> pBulletList + "ol" -> pOrderedList + "dl" -> pDefinitionList + "table" -> pTable block + "hr" -> pHrule + "html" -> pHtml + "head" -> pHead + "body" -> pBody + "div" + | extensionEnabled Ext_line_blocks exts + , Just "line-block" <- lookup "class" attr + -> pLineBlock + | otherwise + -> pDiv + "section" -> pDiv + "header" -> pDiv + "main" -> pDiv + "figure" -> pFigure + "iframe" -> pIframe + "style" -> pRawHtmlBlock + "textarea" -> pRawHtmlBlock + "switch" + | epubExts + -> eSwitch B.para block + _ -> mzero + _ -> mzero) <|> pPlain <|> pRawHtmlBlock) >>= \res -> + res <$ trace (T.take 60 $ tshow $ B.toList res) namespaces :: PandocMonad m => [(Text, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] @@ -243,9 +289,6 @@ eTOC = try $ do guard $ (lookup "type" attr <|> lookup "epub:type" attr) == Just "toc" void (pInTags tag block) -pList :: PandocMonad m => TagParser m Blocks -pList = pBulletList <|> pOrderedList <|> pDefinitionList - pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do pSatisfy (matchTagOpen "ul" []) @@ -319,7 +362,10 @@ pDefListItem = try $ do terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = foldl1 (\x y -> x <> B.linebreak <> y) $ map trimInlines terms + let term = foldl' (\x y -> if null x + then trimInlines y + else x <> B.linebreak <> trimInlines y) + mempty terms return (term, map (fixPlains True) defs) fixPlains :: Bool -> Blocks -> Blocks @@ -356,13 +402,16 @@ pLineBlock = try $ do B.toList ils return $ B.lineBlock lns +isDivLike :: Text -> Bool +isDivLike "div" = True +isDivLike "section" = True +isDivLike "header" = True +isDivLike "main" = True +isDivLike _ = False + pDiv :: 
PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs - let isDivLike "div" = True - isDivLike "section" = True - isDivLike "main" = True - isDivLike _ = False TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) let (ident, classes, kvs) = toAttr attr' contents <- pInTags tag block @@ -380,11 +429,17 @@ pIframe = try $ do tag <- pSatisfy (tagOpen (=="iframe") (isJust . lookup "src")) pCloses "iframe" <|> eof url <- canonicalizeUrl $ fromAttrib "src" tag - (bs, _) <- openURL url - let inp = UTF8.toText bs - opts <- readerOpts <$> getState - Pandoc _ contents <- readHtml opts inp - return $ B.divWith ("",["iframe"],[]) $ B.fromList contents + if T.null url + then ignore $ renderTags' [tag, TagClose "iframe"] + else catchError + (do (bs, _) <- openURL url + let inp = UTF8.toText bs + opts <- readerOpts <$> getState + Pandoc _ contents <- readHtml opts inp + return $ B.divWith ("",["iframe"],[]) $ B.fromList contents) + (\e -> do + logMessage $ CouldNotFetchResource url (renderError e) + ignore $ renderTags' [tag, TagClose "iframe"]) pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do @@ -446,17 +501,13 @@ pHeader = try $ do tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let attr = toStringAttr attr' - let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) - [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] T.words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] attr'' <- registerHeader (ident, classes, keyvals) contents - return $ if bodyTitle - then mempty -- skip a representation of the title in the body - else B.headerWith attr'' level contents + return $ B.headerWith attr'' level contents pHrule :: PandocMonad m => TagParser m Blocks pHrule = do @@ -506,7 +557,18 @@ pFigure = try $ do pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) - let attr = toAttr attr' + -- if the `pre` has no attributes, try if it is followed by a `code` + -- element and use those attributes if possible. 
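-- Sketch, not from this patch, of the attribute rewrite carried out just
-- below: when the attributes are borrowed from an inner <code> element, a
-- class of the form "language-xyz" is reduced to plain "xyz". The helper
-- name stripLanguagePrefix is made up for illustration; it assumes
-- Data.Text (as T) and Data.Maybe (fromMaybe) are in scope.
stripLanguagePrefix :: (Text, Text) -> (Text, Text)
stripLanguagePrefix (k, v)
  | k == "class" = (k, fromMaybe v (T.stripPrefix "language-" v))
  | otherwise    = (k, v)
-- e.g. stripLanguagePrefix ("class", "language-haskell") == ("class", "haskell"),
-- so <pre><code class="language-haskell"> ends up as a code block classed "haskell".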
+ attr <- case attr' of + _:_ -> pure (toAttr attr') + [] -> option nullAttr $ do + TagOpen _ codeAttr <- pSatisfy (matchTagOpen "code" []) + pure $ toAttr + [ (k, v') | (k, v) <- codeAttr + -- strip language from class + , let v' = if k == "class" + then fromMaybe v (T.stripPrefix "language-" v) + else v ] contents <- manyTill pAny (pCloses "pre" <|> eof) let rawText = T.concat $ map tagToText contents -- drop leading newline if any @@ -525,31 +587,47 @@ tagToText (TagOpen "br" _) = "\n" tagToText _ = "" inline :: PandocMonad m => TagParser m Inlines -inline = choice - [ eNoteref - , eSwitch id inline - , pTagText - , pQ - , pEmph - , pStrong - , pSuperscript - , pSubscript - , pSpanLike - , pSmall - , pStrikeout - , pUnderline - , pLineBreak - , pLink - , pImage - , pSvg - , pBdo - , pCode - , pCodeWithClass [("samp","sample"),("var","variable")] - , pSpan - , pMath False - , pScriptMath - , pRawHtmlInline - ] +inline = pTagText <|> do + tag <- lookAhead (pSatisfy isInlineTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + case name of + "a" | extensionEnabled Ext_epub_html_exts exts + , Just "noteref" <- lookup "type" attr <|> lookup "epub:type" attr + , Just ('#',_) <- lookup "href" attr >>= T.uncons + -> eNoteref + | otherwise -> pLink + "switch" -> eSwitch id inline + "q" -> pQ + "em" -> pEmph + "i" -> pEmph + "strong" -> pStrong + "b" -> pStrong + "sup" -> pSuperscript + "sub" -> pSubscript + "small" -> pSmall + "s" -> pStrikeout + "strike" -> pStrikeout + "del" -> pStrikeout + "u" -> pUnderline + "ins" -> pUnderline + "br" -> pLineBreak + "img" -> pImage + "svg" -> pSvg + "bdo" -> pBdo + "code" -> pCode + "samp" -> pCodeWithClass "samp" "sample" + "var" -> pCodeWithClass "var" "variable" + "span" -> pSpan + "math" -> pMath False + "script" + | Just x <- lookup "type" attr + , "math/tex" `T.isPrefixOf` x -> pScriptMath + _ | name `elem` htmlSpanLikeElements -> pSpanLike + _ -> pRawHtmlInline + TagText _ -> pTagText + _ -> pRawHtmlInline pSelfClosing :: PandocMonad m => (Text -> Bool) -> ([Attribute Text] -> Bool) @@ -560,27 +638,25 @@ pSelfClosing f g = do return open pQ :: PandocMonad m => TagParser m Inlines -pQ = choice $ map try [citedQuote, normalQuote] - where citedQuote = do - tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst)) - - url <- canonicalizeUrl $ fromAttrib "cite" tag - let uid = fromMaybe (fromAttrib "name" tag) $ - maybeFromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - - makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)]) - normalQuote = do - pSatisfy $ tagOpenLit "q" (const True) - makeQuote id - makeQuote wrapper = do - ctx <- asks quoteContext - let (constructor, innerContext) = case ctx of - InDoubleQuote -> (B.singleQuoted, InSingleQuote) - _ -> (B.doubleQuoted, InDoubleQuote) - - content <- withQuoteContext innerContext (mconcat <$> manyTill inline (pCloses "q")) - return $ extractSpaces (constructor . 
wrapper) content +pQ = do + TagOpen _ attrs <- pSatisfy $ tagOpenLit "q" (const True) + case lookup "cite" attrs of + Just url -> do + let uid = fromMaybe mempty $ + lookup "name" attrs <> lookup "id" attrs + let cls = maybe [] T.words $ lookup "class" attrs + url' <- canonicalizeUrl url + makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')]) + Nothing -> makeQuote id + where + makeQuote wrapper = do + ctx <- asks quoteContext + let (constructor, innerContext) = case ctx of + InDoubleQuote -> (B.singleQuoted, InSingleQuote) + _ -> (B.doubleQuoted, InDoubleQuote) + content <- withQuoteContext innerContext + (mconcat <$> manyTill inline (pCloses "q")) + return $ extractSpaces (constructor . wrapper) content pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph @@ -646,17 +722,12 @@ pLink = try $ do pImage :: PandocMonad m => TagParser m Inlines pImage = do - tag <- pSelfClosing (=="img") (isJust . lookup "src") + tag@(TagOpen _ attr') <- pSelfClosing (=="img") (isJust . lookup "src") url <- canonicalizeUrl $ fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - let getAtt k = case fromAttrib k tag of - "" -> [] - v -> [(k, v)] - let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] - return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) + let attr = toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr' + return $ B.imageWith attr (escapeURI url) title (B.text alt) pSvg :: PandocMonad m => TagParser m Inlines pSvg = do @@ -671,13 +742,12 @@ pSvg = do UTF8.toText (encode $ UTF8.fromText rawText) return $ B.imageWith (ident,cls,[]) svgData mempty mempty -pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines -pCodeWithClass elemToClass = try $ do - let tagTest = flip elem . fmap fst $ elemToClass - TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True) +pCodeWithClass :: PandocMonad m => Text -> Text -> TagParser m Inlines +pCodeWithClass name class' = try $ do + TagOpen open attr' <- pSatisfy $ tagOpen (== name) (const True) result <- manyTill pAny (pCloses open) let (ids,cs,kvs) = toAttr attr' - cs' = maybe cs (:cs) . lookup open $ elemToClass + cs' = class' : cs return . B.codeWith (ids,cs',kvs) . T.unwords . T.lines . 
innerText $ result @@ -764,17 +834,19 @@ pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do + pos <- getPosition (TagText str) <- pSatisfy isTagText st <- getState qu <- ask parsed <- lift $ lift $ - flip runReaderT qu $ runParserT (many pTagContents) st "text" str + flip runReaderT qu $ runParserT (many pTagContents) st "text" + (Sources [(pos, str)]) case parsed of Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'" Right result -> return $ mconcat result -type InlinesParser m = HTMLParser m Text +type InlinesParser m = HTMLParser m Sources pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = @@ -868,27 +940,23 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> then return B.softbreak else return B.space -class NamedTag a where - getTagName :: a -> Maybe Text - -instance NamedTag (Tag Text) where - getTagName (TagOpen t _) = Just t - getTagName (TagClose t) = Just t - getTagName _ = Nothing - -instance NamedTag (Tag String) where - getTagName (TagOpen t _) = Just (T.pack t) - getTagName (TagClose t) = Just (T.pack t) - getTagName _ = Nothing - -isInlineTag :: NamedTag (Tag a) => Tag a -> Bool -isInlineTag t = - isCommentTag t || case getTagName t of - Nothing -> False - Just x -> x `Set.notMember` blockTags || - T.take 1 x == "?" -- processing instr. - -isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +getTagName :: Tag Text -> Maybe Text +getTagName (TagOpen t _) = Just t +getTagName (TagClose t) = Just t +getTagName _ = Nothing + +isInlineTag :: Tag Text -> Bool +isInlineTag t = isCommentTag t || case t of + TagOpen "script" _ -> "math/tex" `T.isPrefixOf` fromAttrib "type" t + TagClose "script" -> True + TagOpen name _ -> isInlineTagName name + TagClose name -> isInlineTagName name + _ -> False + where isInlineTagName x = + x `Set.notMember` blockTags || + T.take 1 x == "?" -- processing instr. + +isBlockTag :: Tag Text -> Bool isBlockTag t = isBlockTagName || isTagComment t where isBlockTagName = case getTagName t of @@ -899,10 +967,10 @@ isBlockTag t = isBlockTagName || isTagComment t || x `Set.member` eitherBlockOrInline Nothing -> False -isTextTag :: Tag a -> Bool +isTextTag :: Tag Text -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag a -> Bool +isCommentTag :: Tag Text -> Bool isCommentTag = tagComment (const True) --- parsers for use in markdown, textile readers @@ -910,13 +978,14 @@ isCommentTag = tagComment (const True) -- | Matches a stretch of HTML in balanced tags. htmlInBalanced :: Monad m => (Tag Text -> Bool) - -> ParserT Text st m Text + -> ParserT Sources st m Text htmlInBalanced f = try $ do lookAhead (char '<') - inp <- getInput - let ts = canonicalizeTags $ - parseTagsOptions parseOptions{ optTagWarning = True, - optTagPosition = True } inp + sources <- getInput + let ts = canonicalizeTags + $ parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } + $ sourcesToText sources case ts of (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do guard $ f t @@ -951,22 +1020,24 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag a] -> Bool +hasTagWarning :: [Tag Text] -> Bool hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False -- | Matches a tag meeting a certain condition. 
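-- A quick illustration, not part of this patch, of the monomorphic Tag Text
-- classification defined above: with the new isInlineTag, a script tag counts
-- as inline only when its type attribute starts with "math/tex". The
-- definition names are made up; TagOpen comes from Text.HTML.TagSoup, and
-- OverloadedStrings is assumed so the literals are Text.
mathScriptIsInline :: Bool
mathScriptIsInline =
  isInlineTag (TagOpen "script" [("type", "math/tex")])  -- True

paragraphTagIsInline :: Bool
paragraphTagIsInline =
  isInlineTag (TagOpen "p" [])  -- False: "p" is in the reader's blockTags set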
htmlTag :: (HasReaderOptions st, Monad m) => (Tag Text -> Bool) - -> ParserT Text st m (Tag Text, Text) + -> ParserT Sources st m (Tag Text, Text) htmlTag f = try $ do lookAhead (char '<') startpos <- getPosition - inp <- getInput + sources <- getInput + let inp = sourcesToText sources let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = False , optTagPosition = True } - (inp <> " ") -- add space to ensure that + (inp <> " ") + -- add space to ensure that -- we get a TagPosition after the tag (next, ln, col) <- case ts of (TagPosition{} : next : TagPosition ln col : _) @@ -1024,21 +1095,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. @@ -1048,26 +1104,3 @@ canonicalizeUrl url = do return $ case (parseURIReference (T.unpack url), mbBaseHref) of (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url - --- For now we need a special version here; the one in Shared has String type -renderTags' :: [Tag Text] -> Text -renderTags' = renderTagsOptions - renderOptions{ optMinimize = matchTags ["hr", "br", "img", - "meta", "link"] - , optRawTag = matchTags ["script", "style"] } - where matchTags tags = flip elem tags . T.toLower - - --- EPUB Specific --- --- -{- - -types :: [(String, ([String], Int))] -types = -- Document divisions - map (\s -> (s, (["section", "body"], 0))) - ["volume", "part", "chapter", "division"] - <> -- Document section and components - [ - ("abstract", ([], 0))] --} diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..bd8d7c96c 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -161,10 +161,12 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True -"tr" `closes` t | t `elem` ["th","td","tr"] = True +"tr" `closes` t | t `elem` ["th","td","tr","colgroup"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"col" `closes` "col" = True +"colgroup" `closes` "col" = True "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True @@ -193,14 +195,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) 
ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 5a783988f..6e62e12f5 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -3,8 +3,8 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML.Table - Copyright : © 2006-2020 John MacFarlane, - 2020 Albert Krewinkel + Copyright : © 2006-2021 John MacFarlane, + 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <albert@zeitkraut.de> @@ -17,6 +17,8 @@ module Text.Pandoc.Readers.HTML.Table (pTable) where import Control.Applicative ((<|>)) import Data.Maybe (fromMaybe) +import Data.Either (lefts, rights) +import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) import Text.HTML.TagSoup import Text.Pandoc.Builder (Blocks) @@ -32,34 +34,51 @@ import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) import qualified Data.Text as T import qualified Text.Pandoc.Builder as B --- | Parses a @<col>@ element, returning the column's width. Defaults to --- @'ColWidthDefault'@ if the width is not set or cannot be determined. -pCol :: PandocMonad m => TagParser m ColWidth +-- | Parses a @<col>@ element, returning the column's width. +-- An Either value is used: Left i means a "relative length" with +-- integral value i (see https://www.w3.org/TR/html4/types.html#h-6.6); +-- Right w means a regular width. Defaults to @'Right ColWidthDefault'@ +-- if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m (Either Int ColWidth) pCol = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) let attribs = toStringAttr attribs' skipMany pBlank optional $ pSatisfy (matchTagClose "col") skipMany pBlank - let width = case lookup "width" attribs of + return $ case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> - fromMaybe 0.0 $ safeRead (T.filter - (`notElem` (" \t\r\n%'\";" :: [Char])) xs) - _ -> 0.0 + maybe (Right ColWidthDefault) (Right . ColWidth . (/ 100.0)) + $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> Right ColWidthDefault + Just (T.unsnoc -> Just (xs, '*')) -> + maybe (Left 1) Left $ safeRead xs Just (T.unsnoc -> Just (xs, '%')) -> - fromMaybe 0.0 $ safeRead xs - _ -> 0.0 - if width > 0.0 - then return $ ColWidth $ width / 100.0 - else return ColWidthDefault + maybe (Right ColWidthDefault) + (Right . ColWidth . 
(/ 100.0)) $ safeRead xs + _ -> Right ColWidthDefault -pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup :: PandocMonad m => TagParser m [Either Int ColWidth] pColgroup = try $ do pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +resolveRelativeLengths :: [Either Int ColWidth] -> [ColWidth] +resolveRelativeLengths ws = + let remaining = 1 - sum (map getColWidth $ rights ws) + relatives = sum $ lefts ws + relUnit = remaining / fromIntegral relatives + toColWidth (Right x) = x + toColWidth (Left i) = ColWidth (fromIntegral i * relUnit) + in map toColWidth ws + +getColWidth :: ColWidth -> Double +getColWidth ColWidthDefault = 0 +getColWidth (ColWidth w) = w + data CellType = HeaderCell | BodyCell @@ -181,7 +200,8 @@ pTable :: PandocMonad m pTable block = try $ do TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank caption <- option mempty $ pInTags "caption" block <* skipMany pBlank - widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank + widths <- resolveRelativeLengths <$> + ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank thead <- pTableHead block <* skipMany pBlank topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank tbodies <- many (pTableBody block) <* skipMany pBlank @@ -214,8 +234,9 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) normalize widths head' bodies foot = do let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot - let rowLength = length . rowCells - let ncols = maximum (map rowLength rows) + let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs + let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . 
rowCells + let ncols = maybe 0 maximum $ nonEmpty $ map rowLength rows let tblType = tableType (map rowCells rows) -- fail on empty table if null rows diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs index 4f82a1831..b7bd40fee 100644 --- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.TagCategories - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs index a94eeb828..12c519ad6 100644 --- a/src/Text/Pandoc/Readers/HTML/Types.hs +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -2,7 +2,7 @@ {-# LANGUAGE MultiParamTypeClasses #-} {- | Module : Text.Pandoc.Readers.HTML.Types - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/Haddock.hs b/src/Text/Pandoc/Readers/Haddock.hs index 25d69f040..67b3af2d3 100644 --- a/src/Text/Pandoc/Readers/Haddock.hs +++ b/src/Text/Pandoc/Readers/Haddock.hs @@ -17,8 +17,9 @@ module Text.Pandoc.Readers.Haddock import Control.Monad.Except (throwError) import Data.List (intersperse) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe) -import Data.Text (Text, unpack) +import Data.Text (unpack) import qualified Data.Text as T import Documentation.Haddock.Parser import Documentation.Haddock.Types as H @@ -28,15 +29,17 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter, splitTextBy, trim) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import Text.Pandoc.Shared (splitTextBy, trim) -- | Parse Haddock markup and return a 'Pandoc' document. -readHaddock :: PandocMonad m +readHaddock :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc -readHaddock opts s = case readHaddockEither opts (unpack (crFilter s)) of +readHaddock opts s = case readHaddockEither opts + (unpack . sourcesToText . 
toSources $ s) of Right result -> return result Left e -> throwError e @@ -92,7 +95,7 @@ docHToBlocks d' = then ([], map toCells bodyRows) else (toCells (head headerRows), map toCells (tail headerRows ++ bodyRows)) - colspecs = replicate (maximum (map length body)) + colspecs = replicate (maybe 0 maximum (nonEmpty (map length body))) (AlignDefault, ColWidthDefault) in B.table B.emptyCaption colspecs @@ -128,7 +131,8 @@ docHToInlines isCode d' = DocIdentifier s -> B.codeWith ("",["haskell","identifier"],[]) $ T.pack s _ -> mempty DocIdentifierUnchecked s -> B.codeWith ("",["haskell","identifier"],[]) $ T.pack s - DocModule s -> B.codeWith ("",["haskell","module"],[]) $ T.pack s + DocModule s -> B.codeWith ("",["haskell","module"],[]) $ + T.pack (modLinkName s) DocWarning _ -> mempty -- TODO DocEmphasis d -> B.emph (docHToInlines isCode d) DocMonospaced (DocString s) -> B.code $ T.pack s diff --git a/src/Text/Pandoc/Readers/Ipynb.hs b/src/Text/Pandoc/Readers/Ipynb.hs index a866e6ec3..cd1093109 100644 --- a/src/Text/Pandoc/Readers/Ipynb.hs +++ b/src/Text/Pandoc/Readers/Ipynb.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.Ipynb - Copyright : Copyright (C) 2019-2020 John MacFarlane + Copyright : Copyright (C) 2019-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -39,10 +39,12 @@ import Data.Aeson as Aeson import Control.Monad.Except (throwError) import Text.Pandoc.Readers.Markdown (readMarkdown) import qualified Text.Pandoc.UTF8 as UTF8 +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -readIpynb :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readIpynb opts t = do - let src = BL.fromStrict (TE.encodeUtf8 t) +readIpynb :: (PandocMonad m, ToSources a) + => ReaderOptions -> a -> m Pandoc +readIpynb opts x = do + let src = BL.fromStrict . TE.encodeUtf8 . sourcesToText $ toSources x case eitherDecode src of Right (notebook4 :: Notebook NbV4) -> notebookToPandoc opts notebook4 Left _ -> diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index c638da519..9cdbf1611 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -14,7 +14,9 @@ Conversion of JATS XML to 'Pandoc' document. 
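
A recurring pattern in this patch (the table normalizer and the Haddock column specs above, and parseLaTeX's header-level scan further down) is replacing a partial maximum with a total one built on Data.List.NonEmpty. A minimal sketch of that idiom, with an illustrative helper name not taken from the patch:

import Data.List.NonEmpty (nonEmpty)

-- Total maximum with an explicit default for empty input,
-- i.e. the `maybe 0 maximum . nonEmpty` idiom used for column counts.
maximumOr :: Ord a => a -> [a] -> a
maximumOr dflt = maybe dflt maximum . nonEmpty

-- ghci> maximumOr 0 ([] :: [Int])   -- 0, no crash on an empty table
-- ghci> maximumOr 0 [2, 5, 3]       -- 5
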
module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict -import Data.Char (isDigit, isSpace, toUpper) +import Control.Monad.Except (throwError) +import Text.Pandoc.Error (PandocError(..)) +import Data.Char (isDigit, isSpace) import Data.Default import Data.Generics import Data.List (foldl', intersperse) @@ -22,15 +24,17 @@ import qualified Data.Map as Map import Data.Maybe (maybeToList, fromMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options -import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) +import Text.Pandoc.Shared (safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light +import Text.Pandoc.XML.Light import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) type JATS m = StateT JATSState m @@ -49,42 +53,28 @@ instance Default JATSState where , jatsContent = [] } -readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readJATS :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readJATS _ inp = do - let tree = normalizeTree . parseXML - $ T.unpack $ crFilter inp + let sources = toSources inp + tree <- either (throwError . PandocXMLError "") return $ + parseXMLContents (TL.fromStrict . sourcesToText $ sources) (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree return $ Pandoc (jatsMeta st') (toList . mconcat $ bs) --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr = fromMaybe "" . 
maybeAttrValue attr -maybeAttrValue :: String -> Element -> Maybe Text +maybeAttrValue :: Text -> Element -> Maybe Text maybeAttrValue attr elt = - T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) -- convenience function -named :: String -> Element -> Bool +named :: Text -> Element -> Bool named s e = qName (elName e) == s -- @@ -150,10 +140,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> JATS m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "p" -> parseMixed para (elContent e) @@ -202,7 +192,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ textContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -266,7 +256,7 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = do - w <- findAttrText (unqual "colwidth") c + w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w if n > 0 then Just n else Nothing let numrows = foldl' max 0 $ map length bodyrows @@ -437,16 +427,10 @@ parseRef e = do Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty -- TODO handle mixed-citation -findAttrText :: QName -> Element -> Maybe Text -findAttrText x = fmap T.pack . findAttr x - textContent :: Element -> Text -textContent = T.pack . strContent - -textContentRecursive :: Element -> Text -textContentRecursive = T.pack . strContentRecursive +textContent = strContent -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -455,9 +439,9 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> JATS m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s -parseInline (CRef ref) = - return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack) + $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "italic" -> innerInlines emph @@ -502,9 +486,9 @@ parseInline (Elem e) = else linkWith attr ("#" <> rid) "" ils "ext-link" -> do ils <- innerInlines id - let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e + let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "rid" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, [], []) @@ -512,7 +496,8 @@ parseInline (Elem e) = "disp-formula" -> formula displayMath "inline-formula" -> formula math - "math" | qPrefix (elName e) == Just "mml" -> return . 
math $ mathML e + "math" | qURI (elName e) == Just "http://www.w3.org/1998/Math/MathML" + -> return . math $ mathML e "tex-math" -> return . math $ textContent e "email" -> return $ link ("mailto:" <> textContent e) "" @@ -524,7 +509,7 @@ parseInline (Elem e) = where innerInlines f = extractSpaces f . mconcat <$> mapM parseInline (elContent e) mathML x = - case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of + case readMathML . showElement $ everywhere (mkT removePrefix) x of Left _ -> mempty Right m -> writeTeX m formula constructor = do @@ -535,11 +520,12 @@ parseInline (Elem e) = filterChildren isMathML whereToLook return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs - isMathML x = qName (elName x) == "math" && - qPrefix (elName x) == Just "mml" + isMathML x = qName (elName x) == "math" && + qURI (elName x) == + Just "http://www.w3.org/1998/Math/MathML" removePrefix elname = elname { qPrefix = Nothing } codeWithLang = do let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e diff --git a/src/Text/Pandoc/Readers/Jira.hs b/src/Text/Pandoc/Readers/Jira.hs index 9266ce10d..cf111f173 100644 --- a/src/Text/Pandoc/Readers/Jira.hs +++ b/src/Text/Pandoc/Readers/Jira.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org - Copyright : © 2019-2020 Albert Krewinkel + Copyright : © 2019-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -20,18 +20,20 @@ import Text.Pandoc.Builder hiding (cell) import Text.Pandoc.Error (PandocError (PandocParseError)) import Text.Pandoc.Options (ReaderOptions) import Text.Pandoc.Shared (stringify) - +import Text.Pandoc.Sources (ToSources(..), sourcesToText) import qualified Text.Jira.Markup as Jira -- | Read Jira wiki markup. -readJira :: PandocMonad m +readJira :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc -readJira _opts s = case parse s of - Right d -> return $ jiraToPandoc d - Left e -> throwError . PandocParseError $ - "Jira parse error" `append` pack (show e) +readJira _opts inp = do + let sources = toSources inp + case parse (sourcesToText sources) of + Right d -> return $ jiraToPandoc d + Left e -> throwError . PandocParseError $ + "Jira parse error" `append` pack (show e) jiraToPandoc :: Jira.Doc -> Pandoc jiraToPandoc (Jira.Doc blks) = doc $ foldMap jiraToPandocBlocks blks @@ -71,10 +73,10 @@ toPandocCodeBlocks langMay params txt = Nothing -> [] in codeBlockWith ("", classes, map paramToPair params) txt --- | Create a pandoc @'Div'@ +-- | Create a pandoc @'Div'@ from a panel. toPandocDiv :: [Jira.Parameter] -> [Jira.Block] -> Blocks toPandocDiv params = - divWith ("", [], map paramToPair params) . foldMap jiraToPandocBlocks + divWith ("", ["panel"], map paramToPair params) . foldMap jiraToPandocBlocks paramToPair :: Jira.Parameter -> (Text, Text) paramToPair (Jira.Parameter key value) = (key, value) @@ -170,6 +172,8 @@ jiraLinkToPandoc linkType alias url = Jira.Email -> link ("mailto:" <> url') "" alias' Jira.Attachment -> linkWith ("", ["attachment"], []) url' "" alias' Jira.User -> linkWith ("", ["user-account"], []) url' "" alias' + Jira.SmartCard -> linkWith ("", ["smart-card"], []) url' "" alias' + Jira.SmartLink -> linkWith ("", ["smart-link"], []) url' "" alias' -- | Get unicode representation of a Jira icon. 
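
The reader signatures in this patch (readJira just above, readIpynb and readHaddock earlier) are generalized from Text to any ToSources input and then flattened back with sourcesToText. A minimal sketch of that wrapper shape around a hypothetical Text-based parser (readDemo and parseDemo are illustrative names, not part of the patch):

import Control.Monad.Except (throwError)
import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Definition (Block (Plain), Inline (Str), Pandoc (..), nullMeta)
import Text.Pandoc.Error (PandocError (PandocParseError))
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Sources (ToSources (..), sourcesToText)

-- Hypothetical Text-based core; real readers plug in their own parser here.
parseDemo :: Text -> Either Text Pandoc
parseDemo t
  | T.null (T.strip t) = Left "empty input"
  | otherwise          = Right (Pandoc nullMeta [Plain [Str t]])

-- Accepts any ToSources input, as the patched readers do.
readDemo :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc
readDemo _opts inp =
  case parseDemo (sourcesToText (toSources inp)) of
    Right result -> return result
    Left err     -> throwError (PandocParseError err)
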
iconUnicode :: Jira.Icon -> Text diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index cdccaa535..27c018e73 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,14 +1,10 @@ -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE CPP #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.LaTeX - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -22,50 +18,58 @@ module Text.Pandoc.Readers.LaTeX ( readLaTeX, applyMacros, rawLaTeXInline, rawLaTeXBlock, - inlineCommand, - tokenize, - untokenize + inlineCommand ) where import Control.Applicative (many, optional, (<|>)) import Control.Monad import Control.Monad.Except (throwError) -import Data.Char (isDigit, isLetter, toUpper, chr) +import Data.Char (isDigit, isLetter, isAlphaNum, toUpper, chr) import Data.Default -import Data.Functor (($>)) import Data.List (intercalate) import qualified Data.Map as M import Data.Maybe (fromMaybe, maybeToList) import qualified Data.Set as Set -import qualified Data.Sequence as Seq import Data.Text (Text) import qualified Data.Text as T +import Skylighting (defaultSyntaxMap) import System.FilePath (addExtension, replaceExtension, takeExtension) -import Text.Pandoc.BCP47 (Lang (..), renderLang) -import Text.Pandoc.Builder +import Text.Collate.Lang (renderLang) +import Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocPure (PandocPure) import Text.Pandoc.Class.PandocMonad (PandocMonad (..), getResourcePath, - readFileFromDirs, report, setResourcePath, - setTranslations, translateTerm) + readFileFromDirs, report, + setResourcePath) import Text.Pandoc.Error (PandocError (PandocParseError, PandocParsecError)) -import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension) +import Text.Pandoc.Highlighting (languagesByExtension) import Text.Pandoc.ImageSize (numUnit, showFl) import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, optional, space, spaces, withRaw, (<|>)) -import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), - ArgSpec (..), Tok (..), TokType (..)) +import Text.Pandoc.Readers.LaTeX.Types (Tok (..), TokType (..)) import Text.Pandoc.Readers.LaTeX.Parsing -import Text.Pandoc.Readers.LaTeX.Lang (polyglossiaLangToBCP47, - babelLangToBCP47) -import Text.Pandoc.Readers.LaTeX.SIunitx +import Text.Pandoc.Readers.LaTeX.Citation (citationCommands, cites) +import Text.Pandoc.Readers.LaTeX.Math (dollarsMath, inlineEnvironments, + inlineEnvironment, + mathDisplay, mathInline, + newtheorem, theoremstyle, proof, + theoremEnvironment) +import Text.Pandoc.Readers.LaTeX.Table (tableEnvironments) +import Text.Pandoc.Readers.LaTeX.Macro (macroDef) +import Text.Pandoc.Readers.LaTeX.Lang (inlineLanguageCommands, + enquoteCommands, + babelLangToBCP47, setDefaultLanguage) +import Text.Pandoc.Readers.LaTeX.SIunitx (siunitxCommands) +import Text.Pandoc.Readers.LaTeX.Inline (acronymCommands, refCommands, + nameCommands, charCommands, + accentCommands, + biblatexInlineCommands, + verbCommands, rawInlineOr, + listingsLanguage) import Text.Pandoc.Shared -import qualified Text.Pandoc.Translations as Translations import Text.Pandoc.Walk 
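
The new LaTeX submodules imported above each contribute a table of inline commands; the tables are merged further down with M.unions, which is left-biased. A toy sketch of that composition (names and the payload type are illustrative only; in the reader the payload is a parser, LP m Inlines):

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Map as M
import Data.Text (Text)

-- Stand-in payload for demonstration purposes.
type Handler = Text

accentDemo, charDemo :: M.Map Text Handler
accentDemo = M.fromList [("'", "acute"), ("`", "grave")]
charDemo   = M.fromList [("ldots", "…"), ("`", "backtick-char")]

-- Left-biased merge: on a duplicate key the earlier table wins,
-- so accentDemo's "`" entry shadows charDemo's.
commandsDemo :: M.Map Text Handler
commandsDemo = M.unions [accentDemo, charDemo]
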
-import qualified Text.Pandoc.Builder as B -import qualified Data.Text.Normalize as Normalize -import Safe +import Data.List.NonEmpty (nonEmpty) -- for debugging: -- import Text.Pandoc.Extensions (getDefaultExtensions) @@ -73,16 +77,17 @@ import Safe -- import Debug.Trace (traceShowId) -- | Parse LaTeX from string and return 'Pandoc' document. -readLaTeX :: PandocMonad m +readLaTeX :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> a -- ^ Input to parse -> m Pandoc readLaTeX opts ltx = do + let sources = toSources ltx parsed <- runParserT parseLaTeX def{ sOptions = opts } "source" - (tokenize "source" (crFilter ltx)) + (tokenizeSources sources) case parsed of Right result -> return result - Left e -> throwError $ PandocParsecError ltx e + Left e -> throwError $ PandocParsecError sources e parseLaTeX :: PandocMonad m => LP m Pandoc parseLaTeX = do @@ -93,11 +98,7 @@ parseLaTeX = do let doc' = doc bs let headerLevel (Header n _ _) = [n] headerLevel _ = [] -#if MIN_VERSION_safe(0,3,18) - let bottomLevel = minimumBound 1 $ query headerLevel doc' -#else - let bottomLevel = minimumDef 1 $ query headerLevel doc' -#endif + let bottomLevel = maybe 1 minimum $ nonEmpty $ query headerLevel doc' let adjustHeaders m (Header n attr ils) = Header (n+m) attr ils adjustHeaders _ x = x let (Pandoc _ bs') = @@ -132,11 +133,10 @@ resolveRefs _ x = x rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => ParserT Text s m Text + => ParserT Sources s m Text rawLaTeXBlock = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens snd <$> (rawLaTeXParser toks False (macroDef (const mempty)) blocks <|> rawLaTeXParser toks True (do choice (map controlSeq @@ -163,14 +163,13 @@ beginOrEndCommand = try $ do (txt <> untokenize rawargs) rawLaTeXInline :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => ParserT Text s m Text + => ParserT Sources s m Text rawLaTeXInline = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens raw <- snd <$> ( rawLaTeXParser toks True - (mempty <$ (controlSeq "input" >> skipMany opt >> braced)) + (mempty <$ (controlSeq "input" >> skipMany rawopt >> braced)) inlines <|> rawLaTeXParser toks True (inlineEnvironment <|> inlineCommand') inlines @@ -178,11 +177,10 @@ rawLaTeXInline = do finalbraces <- mconcat <$> many (try (string "{}")) -- see #5439 return $ raw <> T.pack finalbraces -inlineCommand :: PandocMonad m => ParserT Text ParserState m Inlines +inlineCommand :: PandocMonad m => ParserT Sources ParserState m Inlines inlineCommand = do lookAhead (try (char '\\' >> letter)) - inp <- getInput - let toks = tokenize "source" inp + toks <- getInputTokens fst <$> rawLaTeXParser toks True (inlineEnvironment <|> inlineCommand') inlines @@ -191,12 +189,6 @@ inlineCommand = do word :: PandocMonad m => LP m Inlines word = str . untoken <$> satisfyTok isWordTok -regularSymbol :: PandocMonad m => LP m Inlines -regularSymbol = str . 
untoken <$> satisfyTok isRegularSymbol - where isRegularSymbol (Tok _ Symbol t) = not $ T.any isSpecial t - isRegularSymbol _ = False - isSpecial c = c `Set.member` specialChars - inlineGroup :: PandocMonad m => LP m Inlines inlineGroup = do ils <- grouped inline @@ -237,19 +229,6 @@ mkImage options (T.unpack -> src) = do _ -> return src return $ imageWith attr (T.pack src') "" alt -doxspace :: PandocMonad m => LP m Inlines -doxspace = - (space <$ lookAhead (satisfyTok startsWithLetter)) <|> return mempty - where startsWithLetter (Tok _ Word t) = - case T.uncons t of - Just (c, _) | isLetter c -> True - _ -> False - startsWithLetter _ = False - - -lit :: Text -> LP m Inlines -lit = pure . str - removeDoubleQuotes :: Text -> Text removeDoubleQuotes t = Data.Maybe.fromMaybe t $ T.stripPrefix "\"" t >>= T.stripSuffix "\"" @@ -296,23 +275,14 @@ quoted' f starter ender = do cs -> cs) else lit startchs -enquote :: PandocMonad m => Bool -> Maybe Text -> LP m Inlines -enquote starred mblang = do - skipopts - let lang = mblang >>= babelLangToBCP47 - let langspan = case lang of - Nothing -> id - Just l -> spanWith ("",[],[("lang", renderLang l)]) - quoteContext <- sQuoteContext <$> getState - if starred || quoteContext == InDoubleQuote - then singleQuoted . langspan <$> withQuoteContext InSingleQuote tok - else doubleQuoted . langspan <$> withQuoteContext InDoubleQuote tok +lit :: Text -> LP m Inlines +lit = pure . str blockquote :: PandocMonad m => Bool -> Maybe Text -> LP m Blocks blockquote cvariant mblang = do citepar <- if cvariant then (\xs -> para (cite xs mempty)) - <$> cites NormalCitation False + <$> cites inline NormalCitation False else option mempty $ para <$> bracketed inline let lang = mblang >>= babelLangToBCP47 let langdiv = case lang of @@ -323,224 +293,13 @@ blockquote cvariant mblang = do optional $ symbolIn (".:;?!" :: [Char]) -- currently ignored return $ blockQuote . langdiv $ (bs <> citepar) -doAcronym :: PandocMonad m => Text -> LP m Inlines -doAcronym form = do - acro <- braced - return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), - ("acronym-form", "singular+" <> form)]) - $ str $ untokenize acro] - -doAcronymPlural :: PandocMonad m => Text -> LP m Inlines -doAcronymPlural form = do - acro <- braced - plural <- lit "s" - return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), - ("acronym-form", "plural+" <> form)]) $ - mconcat [str $ untokenize acro, plural]] - -doverb :: PandocMonad m => LP m Inlines -doverb = do - Tok _ Symbol t <- anySymbol - marker <- case T.uncons t of - Just (c, ts) | T.null ts -> return c - _ -> mzero - withVerbatimMode $ - code . 
untokenize <$> - manyTill (notFollowedBy newlineTok >> verbTok marker) (symbol marker) - -verbTok :: PandocMonad m => Char -> LP m Tok -verbTok stopchar = do - t@(Tok pos toktype txt) <- anyTok - case T.findIndex (== stopchar) txt of - Nothing -> return t - Just i -> do - let (t1, t2) = T.splitAt i txt - inp <- getInput - setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) - : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp - return $ Tok pos toktype t1 - -listingsLanguage :: [(Text, Text)] -> Maybe Text -listingsLanguage opts = - case lookup "language" opts of - Nothing -> Nothing - Just l -> fromListingsLanguage l `mplus` Just l - -dolstinline :: PandocMonad m => LP m Inlines -dolstinline = do - options <- option [] keyvals - let classes = maybeToList $ listingsLanguage options - doinlinecode classes - -domintinline :: PandocMonad m => LP m Inlines -domintinline = do - skipopts - cls <- untokenize <$> braced - doinlinecode [cls] - -doinlinecode :: PandocMonad m => [Text] -> LP m Inlines -doinlinecode classes = do - Tok _ Symbol t <- anySymbol - marker <- case T.uncons t of - Just (c, ts) | T.null ts -> return c - _ -> mzero - let stopchar = if marker == '{' then '}' else marker - withVerbatimMode $ - codeWith ("",classes,[]) . T.map nlToSpace . untokenize <$> - manyTill (verbTok stopchar) (symbol stopchar) - -nlToSpace :: Char -> Char -nlToSpace '\n' = ' ' -nlToSpace x = x - -mathDisplay :: Text -> Inlines -mathDisplay = displayMath . trimMath - -mathInline :: Text -> Inlines -mathInline = math . trimMath - -dollarsMath :: PandocMonad m => LP m Inlines -dollarsMath = do - symbol '$' - display <- option False (True <$ symbol '$') - (do contents <- try $ untokenize <$> pDollarsMath 0 - if display - then mathDisplay contents <$ symbol '$' - else return $ mathInline contents) - <|> (guard display >> return (mathInline "")) - --- Int is number of embedded groupings -pDollarsMath :: PandocMonad m => Int -> LP m [Tok] -pDollarsMath n = do - tk@(Tok _ toktype t) <- anyTok - case toktype of - Symbol | t == "$" - , n == 0 -> return [] - | t == "\\" -> do - tk' <- anyTok - (tk :) . 
(tk' :) <$> pDollarsMath n - | t == "{" -> (tk :) <$> pDollarsMath (n+1) - | t == "}" -> - if n > 0 - then (tk :) <$> pDollarsMath (n-1) - else mzero - _ -> (tk :) <$> pDollarsMath n - --- citations - -addPrefix :: [Inline] -> [Citation] -> [Citation] -addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks -addPrefix _ _ = [] - -addSuffix :: [Inline] -> [Citation] -> [Citation] -addSuffix s ks@(_:_) = - let k = last ks - in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] -addSuffix _ _ = [] - -simpleCiteArgs :: PandocMonad m => LP m [Citation] -simpleCiteArgs = try $ do - first <- optionMaybe $ toList <$> opt - second <- optionMaybe $ toList <$> opt - keys <- try $ bgroup *> manyTill citationLabel egroup - let (pre, suf) = case (first , second ) of - (Just s , Nothing) -> (mempty, s ) - (Just s , Just t ) -> (s , t ) - _ -> (mempty, mempty) - conv k = Citation { citationId = k - , citationPrefix = [] - , citationSuffix = [] - , citationMode = NormalCitation - , citationHash = 0 - , citationNoteNum = 0 - } - return $ addPrefix pre $ addSuffix suf $ map conv keys - -citationLabel :: PandocMonad m => LP m Text -citationLabel = do - sp - untokenize <$> - (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) - <* sp - <* optional (symbol ',') - <* sp) - where bibtexKeyChar = ".:;?!`'()/*@_+=-&[]" :: [Char] - -cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] -cites mode multi = try $ do - cits <- if multi - then do - multiprenote <- optionMaybe $ toList <$> paropt - multipostnote <- optionMaybe $ toList <$> paropt - let (pre, suf) = case (multiprenote, multipostnote) of - (Just s , Nothing) -> (mempty, s) - (Nothing , Just t) -> (mempty, t) - (Just s , Just t ) -> (s, t) - _ -> (mempty, mempty) - tempCits <- many1 simpleCiteArgs - case tempCits of - (k:ks) -> case ks of - (_:_) -> return $ (addMprenote pre k : init ks) ++ - [addMpostnote suf (last ks)] - _ -> return [addMprenote pre (addMpostnote suf k)] - _ -> return [[]] - else count 1 simpleCiteArgs - let cs = concat cits - return $ case mode of - AuthorInText -> case cs of - (c:rest) -> c {citationMode = mode} : rest - [] -> [] - _ -> map (\a -> a {citationMode = mode}) cs - where mprenote (k:ks) = (k:ks) ++ [Space] - mprenote _ = mempty - mpostnote (k:ks) = [Str ",", Space] ++ (k:ks) - mpostnote _ = mempty - addMprenote mpn (k:ks) = - let mpnfinal = case citationPrefix k of - (_:_) -> mprenote mpn - _ -> mpn - in addPrefix mpnfinal (k:ks) - addMprenote _ _ = [] - addMpostnote = addSuffix . mpostnote - -citation :: PandocMonad m => Text -> CitationMode -> Bool -> LP m Inlines -citation name mode multi = do - (c,raw) <- withRaw $ cites mode multi - return $ cite c (rawInline "latex" $ "\\" <> name <> untokenize raw) - -handleCitationPart :: Inlines -> [Citation] -handleCitationPart ils = - let isCite Cite{} = True - isCite _ = False - (pref, rest) = break isCite (toList ils) - in case rest of - (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs - _ -> [] - -complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines -complexNatbibCitation mode = try $ do - (cs, raw) <- - withRaw $ concat <$> do - bgroup - items <- mconcat <$> - many1 (notFollowedBy (symbol ';') >> inline) - `sepBy1` symbol ';' - egroup - return $ map handleCitationPart items - case cs of - [] -> mzero - (c:cits) -> return $ cite (c{ citationMode = mode }:cits) - (rawInline "latex" $ "\\citetext" <> untokenize raw) - -inNote :: Inlines -> Inlines -inNote ils = - note $ para $ ils <> str "." 
- inlineCommand' :: PandocMonad m => LP m Inlines inlineCommand' = try $ do Tok _ (CtrlSeq name) cmd <- anyControlSeq guard $ name /= "begin" && name /= "end" && name /= "and" - star <- option "" ("*" <$ symbol '*' <* sp) + star <- if T.all isAlphaNum name + then option "" ("*" <$ symbol '*' <* sp) + else pure "" overlay <- option "" overlaySpecification let name' = name <> star <> overlay let names = ordNub [name', name] -- check non-starred as fallback @@ -551,28 +310,8 @@ inlineCommand' = try $ do <|> ignore rawcommand lookupListDefault raw names inlineCommands - tok :: PandocMonad m => LP m Inlines -tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar' - where singleChar' = do - Tok _ _ t <- singleChar - return $ str t - -opt :: PandocMonad m => LP m Inlines -opt = do - toks <- try (sp *> bracketedToks <* sp) - -- now parse the toks as inlines - st <- getState - parsed <- runParserT (mconcat <$> many inline) st "bracketed option" toks - case parsed of - Right result -> return result - Left e -> throwError $ PandocParsecError (untokenize toks) e - -paropt :: PandocMonad m => LP m Inlines -paropt = parenWrapped inline - -inBrackets :: Inlines -> Inlines -inBrackets x = str "[" <> x <> str "]" +tok = tokWith inline unescapeURL :: Text -> Text unescapeURL = T.concat . go . T.splitOn "\\" @@ -585,381 +324,109 @@ unescapeURL = T.concat . go . T.splitOn "\\" , isEscapable c = t | otherwise = "\\" <> t -mathEnvWith :: PandocMonad m - => (Inlines -> a) -> Maybe Text -> Text -> LP m a -mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name - where inner x = case innerEnv of - Nothing -> x - Just y -> "\\begin{" <> y <> "}\n" <> x <> - "\\end{" <> y <> "}" - -mathEnv :: PandocMonad m => Text -> LP m Text -mathEnv name = do - skipopts - optional blankline - res <- manyTill anyTok (end_ name) - return $ stripTrailingNewlines $ untokenize res - -inlineEnvironment :: PandocMonad m => LP m Inlines -inlineEnvironment = try $ do - controlSeq "begin" - name <- untokenize <$> braced - M.findWithDefault mzero name inlineEnvironments - -inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) -inlineEnvironments = M.fromList [ - ("displaymath", mathEnvWith id Nothing "displaymath") - , ("math", math <$> mathEnv "math") - , ("equation", mathEnvWith id Nothing "equation") - , ("equation*", mathEnvWith id Nothing "equation*") - , ("gather", mathEnvWith id (Just "gathered") "gather") - , ("gather*", mathEnvWith id (Just "gathered") "gather*") - , ("multline", mathEnvWith id (Just "gathered") "multline") - , ("multline*", mathEnvWith id (Just "gathered") "multline*") - , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") - , ("align", mathEnvWith id (Just "aligned") "align") - , ("align*", mathEnvWith id (Just "aligned") "align*") - , ("alignat", mathEnvWith id (Just "aligned") "alignat") - , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") - , ("dmath", mathEnvWith id Nothing "dmath") - , ("dmath*", mathEnvWith id Nothing "dmath*") - , ("dgroup", mathEnvWith id (Just "aligned") "dgroup") - , ("dgroup*", mathEnvWith id (Just "aligned") "dgroup*") - , ("darray", mathEnvWith id (Just "aligned") "darray") - , ("darray*", mathEnvWith id (Just "aligned") "darray*") - ] - inlineCommands :: PandocMonad m => M.Map Text (LP m Inlines) -inlineCommands = M.union inlineLanguageCommands $ M.fromList - [ ("emph", extractSpaces emph <$> tok) - , ("textit", extractSpaces emph <$> tok) - , ("textsl", 
extractSpaces emph <$> tok) - , ("textsc", extractSpaces smallcaps <$> tok) - , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) - , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) - , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) - , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) - , ("texttt", ttfamily) - , ("sout", extractSpaces strikeout <$> tok) - , ("alert", skipopts >> spanWith ("",["alert"],[]) <$> tok) -- beamer - , ("lq", return (str "‘")) - , ("rq", return (str "’")) - , ("textquoteleft", return (str "‘")) - , ("textquoteright", return (str "’")) - , ("textquotedblleft", return (str "“")) - , ("textquotedblright", return (str "”")) - , ("textsuperscript", extractSpaces superscript <$> tok) - , ("textsubscript", extractSpaces subscript <$> tok) - , ("textbackslash", lit "\\") - , ("backslash", lit "\\") - , ("slash", lit "/") - , ("textbf", extractSpaces strong <$> tok) - , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) - , ("underline", underline <$> tok) - , ("ldots", lit "…") - , ("vdots", lit "\8942") - , ("dots", lit "…") - , ("mdots", lit "…") - , ("sim", lit "~") - , ("sep", lit ",") - , ("label", rawInlineOr "label" dolabel) - , ("ref", rawInlineOr "ref" $ doref "ref") - , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty - , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty - , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty - , ("mbox", rawInlineOr "mbox" $ processHBox <$> tok) - , ("hbox", rawInlineOr "hbox" $ processHBox <$> tok) - , ("lettrine", rawInlineOr "lettrine" lettrine) - , ("(", mathInline . untokenize <$> manyTill anyTok (controlSeq ")")) - , ("[", mathDisplay . untokenize <$> manyTill anyTok (controlSeq "]")) - , ("ensuremath", mathInline . untokenize <$> braced) - , ("texorpdfstring", const <$> tok <*> tok) - , ("P", lit "¶") - , ("S", lit "§") - , ("$", lit "$") - , ("%", lit "%") - , ("&", lit "&") - , ("#", lit "#") - , ("_", lit "_") - , ("{", lit "{") - , ("}", lit "}") - , ("qed", lit "\a0\x25FB") - -- old TeX commands - , ("em", extractSpaces emph <$> inlines) - , ("it", extractSpaces emph <$> inlines) - , ("sl", extractSpaces emph <$> inlines) - , ("bf", extractSpaces strong <$> inlines) - , ("tt", code . stringify . 
toList <$> inlines) - , ("rm", inlines) - , ("itshape", extractSpaces emph <$> inlines) - , ("slshape", extractSpaces emph <$> inlines) - , ("scshape", extractSpaces smallcaps <$> inlines) - , ("bfseries", extractSpaces strong <$> inlines) - , ("MakeUppercase", makeUppercase <$> tok) - , ("MakeTextUppercase", makeUppercase <$> tok) -- textcase - , ("uppercase", makeUppercase <$> tok) - , ("MakeLowercase", makeLowercase <$> tok) - , ("MakeTextLowercase", makeLowercase <$> tok) - , ("lowercase", makeLowercase <$> tok) - , ("/", pure mempty) -- italic correction - , ("aa", lit "å") - , ("AA", lit "Å") - , ("ss", lit "ß") - , ("o", lit "ø") - , ("O", lit "Ø") - , ("L", lit "Ł") - , ("l", lit "ł") - , ("ae", lit "æ") - , ("AE", lit "Æ") - , ("oe", lit "œ") - , ("OE", lit "Œ") - , ("pounds", lit "£") - , ("euro", lit "€") - , ("copyright", lit "©") - , ("textasciicircum", lit "^") - , ("textasciitilde", lit "~") - , ("H", accent '\779' Nothing) -- hungarumlaut - , ("`", accent '\768' (Just '`')) -- grave - , ("'", accent '\769' (Just '\'')) -- acute - , ("^", accent '\770' (Just '^')) -- circ - , ("~", accent '\771' (Just '~')) -- tilde - , ("\"", accent '\776' Nothing) -- umlaut - , (".", accent '\775' Nothing) -- dot - , ("=", accent '\772' Nothing) -- macron - , ("|", accent '\781' Nothing) -- vertical line above - , ("b", accent '\817' Nothing) -- macron below - , ("c", accent '\807' Nothing) -- cedilla - , ("G", accent '\783' Nothing) -- doublegrave - , ("h", accent '\777' Nothing) -- hookabove - , ("d", accent '\803' Nothing) -- dotbelow - , ("f", accent '\785' Nothing) -- inverted breve - , ("r", accent '\778' Nothing) -- ringabove - , ("t", accent '\865' Nothing) -- double inverted breve - , ("U", accent '\782' Nothing) -- double vertical line above - , ("v", accent '\780' Nothing) -- hacek - , ("u", accent '\774' Nothing) -- breve - , ("k", accent '\808' Nothing) -- ogonek - , ("textogonekcentered", accent '\808' Nothing) -- ogonek - , ("i", lit "ı") -- dotless i - , ("j", lit "ȷ") -- dotless j - , ("newtie", accent '\785' Nothing) -- inverted breve - , ("textcircled", accent '\8413' Nothing) -- combining circle - , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState - guard $ not inTableCell - optional opt - spaces)) - , (",", lit "\8198") - , ("@", pure mempty) - , (" ", lit "\160") - , ("ps", pure $ str "PS." <> space) - , ("TeX", lit "TeX") - , ("LaTeX", lit "LaTeX") - , ("bar", lit "|") - , ("textless", lit "<") - , ("textgreater", lit ">") - , ("thanks", skipopts >> note <$> grouped block) - , ("footnote", skipopts >> note <$> grouped block) - , ("passthrough", tok) -- \passthrough macro used by latex writer - -- for listings - , ("verb", doverb) - , ("lstinline", dolstinline) - , ("mintinline", domintinline) - , ("Verb", doverb) - , ("url", (\url -> link url "" (str url)) . unescapeURL . untokenize <$> - bracedUrl) - , ("nolinkurl", code . unescapeURL . untokenize <$> bracedUrl) - , ("href", do url <- bracedUrl - sp - link (unescapeURL $ untokenize url) "" <$> tok) - , ("includegraphics", do options <- option [] keyvals - src <- braced - mkImage options . unescapeURL . removeDoubleQuotes $ - untokenize src) - , ("enquote*", enquote True Nothing) - , ("enquote", enquote False Nothing) - -- foreignquote is supposed to use native quote marks - , ("foreignquote*", braced >>= enquote True . Just . untokenize) - , ("foreignquote", braced >>= enquote False . Just . untokenize) - -- hypehnquote uses regular quotes - , ("hyphenquote*", braced >>= enquote True . Just . 
untokenize) - , ("hyphenquote", braced >>= enquote False . Just . untokenize) - , ("figurename", doTerm Translations.Figure) - , ("prefacename", doTerm Translations.Preface) - , ("refname", doTerm Translations.References) - , ("bibname", doTerm Translations.Bibliography) - , ("chaptername", doTerm Translations.Chapter) - , ("partname", doTerm Translations.Part) - , ("contentsname", doTerm Translations.Contents) - , ("listfigurename", doTerm Translations.ListOfFigures) - , ("listtablename", doTerm Translations.ListOfTables) - , ("indexname", doTerm Translations.Index) - , ("abstractname", doTerm Translations.Abstract) - , ("tablename", doTerm Translations.Table) - , ("enclname", doTerm Translations.Encl) - , ("ccname", doTerm Translations.Cc) - , ("headtoname", doTerm Translations.To) - , ("pagename", doTerm Translations.Page) - , ("seename", doTerm Translations.See) - , ("seealsoname", doTerm Translations.SeeAlso) - , ("proofname", doTerm Translations.Proof) - , ("glossaryname", doTerm Translations.Glossary) - , ("lstlistingname", doTerm Translations.Listing) - , ("cite", citation "cite" NormalCitation False) - , ("Cite", citation "Cite" NormalCitation False) - , ("citep", citation "citep" NormalCitation False) - , ("citep*", citation "citep*" NormalCitation False) - , ("citeal", citation "citeal" NormalCitation False) - , ("citealp", citation "citealp" NormalCitation False) - , ("citealp*", citation "citealp*" NormalCitation False) - , ("autocite", citation "autocite" NormalCitation False) - , ("smartcite", citation "smartcite" NormalCitation False) - , ("footcite", inNote <$> citation "footcite" NormalCitation False) - , ("parencite", citation "parencite" NormalCitation False) - , ("supercite", citation "supercite" NormalCitation False) - , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) - , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) - , ("citeyear", citation "citeyear" SuppressAuthor False) - , ("autocite*", citation "autocite*" SuppressAuthor False) - , ("cite*", citation "cite*" SuppressAuthor False) - , ("parencite*", citation "parencite*" SuppressAuthor False) - , ("textcite", citation "textcite" AuthorInText False) - , ("citet", citation "citet" AuthorInText False) - , ("citet*", citation "citet*" AuthorInText False) - , ("citealt", citation "citealt" AuthorInText False) - , ("citealt*", citation "citealt*" AuthorInText False) - , ("textcites", citation "textcites" AuthorInText True) - , ("cites", citation "cites" NormalCitation True) - , ("autocites", citation "autocites" NormalCitation True) - , ("footcites", inNote <$> citation "footcites" NormalCitation True) - , ("parencites", citation "parencites" NormalCitation True) - , ("supercites", citation "supercites" NormalCitation True) - , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) - , ("Autocite", citation "Autocite" NormalCitation False) - , ("Smartcite", citation "Smartcite" NormalCitation False) - , ("Footcite", inNote <$> citation "Footcite" NormalCitation False) - , ("Parencite", citation "Parencite" NormalCitation False) - , ("Supercite", citation "Supercite" NormalCitation False) - , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) - , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) - , ("Citeyear", citation "Citeyear" SuppressAuthor False) - , ("Autocite*", citation "Autocite*" SuppressAuthor False) - , ("Cite*", citation "Cite*" SuppressAuthor False) - , ("Parencite*", citation "Parencite*" SuppressAuthor 
False) - , ("Textcite", citation "Textcite" AuthorInText False) - , ("Textcites", citation "Textcites" AuthorInText True) - , ("Cites", citation "Cites" NormalCitation True) - , ("Autocites", citation "Autocites" NormalCitation True) - , ("Footcites", inNote <$> citation "Footcites" NormalCitation True) - , ("Parencites", citation "Parencites" NormalCitation True) - , ("Supercites", citation "Supercites" NormalCitation True) - , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) - , ("citetext", complexNatbibCitation NormalCitation) - , ("citeauthor", (try (tok *> sp *> controlSeq "citetext") *> - complexNatbibCitation AuthorInText) - <|> citation "citeauthor" AuthorInText False) - , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= - addMeta "nocite")) - , ("hyperlink", hyperlink) - , ("hypertarget", hypertargetInline) - -- glossaries package - , ("gls", doAcronym "short") - , ("Gls", doAcronym "short") - , ("glsdesc", doAcronym "long") - , ("Glsdesc", doAcronym "long") - , ("GLSdesc", doAcronym "long") - , ("acrlong", doAcronym "long") - , ("Acrlong", doAcronym "long") - , ("acrfull", doAcronym "full") - , ("Acrfull", doAcronym "full") - , ("acrshort", doAcronym "abbrv") - , ("Acrshort", doAcronym "abbrv") - , ("glspl", doAcronymPlural "short") - , ("Glspl", doAcronymPlural "short") - , ("glsdescplural", doAcronymPlural "long") - , ("Glsdescplural", doAcronymPlural "long") - , ("GLSdescplural", doAcronymPlural "long") - -- acronyms package - , ("ac", doAcronym "short") - , ("acf", doAcronym "full") - , ("acs", doAcronym "abbrv") - , ("acl", doAcronym "long") - , ("acp", doAcronymPlural "short") - , ("acfp", doAcronymPlural "full") - , ("acsp", doAcronymPlural "abbrv") - , ("aclp", doAcronymPlural "long") - , ("Ac", doAcronym "short") - , ("Acf", doAcronym "full") - , ("Acs", doAcronym "abbrv") - , ("Acl", doAcronym "long") - , ("Acp", doAcronymPlural "short") - , ("Acfp", doAcronymPlural "full") - , ("Acsp", doAcronymPlural "abbrv") - , ("Aclp", doAcronymPlural "long") - -- siuntix - , ("si", skipopts *> dosi tok) - , ("SI", doSI tok) - , ("SIrange", doSIrange True tok) - , ("numrange", doSIrange False tok) - , ("numlist", doSInumlist) - , ("num", doSInum) - , ("ang", doSIang) - -- hyphenat - , ("bshyp", lit "\\\173") - , ("fshyp", lit "/\173") - , ("dothyp", lit ".\173") - , ("colonhyp", lit ":\173") - , ("hyp", lit "-") - , ("nohyphens", tok) - , ("textnhtt", ttfamily) - , ("nhttfamily", ttfamily) - -- LaTeX colors - , ("textcolor", coloredInline "color") - , ("colorbox", coloredInline "background-color") - -- fontawesome - , ("faCheck", lit "\10003") - , ("faClose", lit "\10007") - -- xspace - , ("xspace", doxspace) - -- etoolbox - , ("ifstrequal", ifstrequal) - , ("newtoggle", braced >>= newToggle) - , ("toggletrue", braced >>= setToggle True) - , ("togglefalse", braced >>= setToggle False) - , ("iftoggle", try $ ifToggle >> inline) - -- biblatex misc - , ("RN", romanNumeralUpper) - , ("Rn", romanNumeralLower) - -- babel - , ("foreignlanguage", foreignlanguage) - -- include - , ("input", rawInlineOr "input" $ include "input") - -- soul package - , ("ul", underline <$> tok) - -- ulem package - , ("uline", underline <$> tok) - -- plain tex stuff that should just be passed through as raw tex - , ("ifdim", ifdim) - -- stackengine - , ("addstackgap", skipopts *> tok) - ] - -accent :: PandocMonad m => Char -> Maybe Char -> LP m Inlines -accent combiningAccent fallBack = try $ do - ils <- tok - case toList ils of - (Str (T.uncons -> Just (x, xs)) : ys) 
-> return $ fromList $ - -- try to normalize to the combined character: - Str (Normalize.normalize Normalize.NFC - (T.pack [x, combiningAccent]) <> xs) : ys - [Space] -> return $ str $ T.singleton $ fromMaybe combiningAccent fallBack - [] -> return $ str $ T.singleton $ fromMaybe combiningAccent fallBack - _ -> return ils - +inlineCommands = M.unions + [ accentCommands tok + , citationCommands inline + , siunitxCommands tok + , acronymCommands + , refCommands + , nameCommands + , verbCommands + , charCommands + , enquoteCommands tok + , inlineLanguageCommands tok + , biblatexInlineCommands tok + , rest ] + where + rest = M.fromList + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) + , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) + , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) + , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) + , ("texttt", ttfamily) + , ("sout", extractSpaces strikeout <$> tok) + , ("alert", skipopts >> spanWith ("",["alert"],[]) <$> tok) -- beamer + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) + , ("underline", underline <$> tok) + , ("mbox", rawInlineOr "mbox" $ processHBox <$> tok) + , ("hbox", rawInlineOr "hbox" $ processHBox <$> tok) + , ("lettrine", rawInlineOr "lettrine" lettrine) + , ("(", mathInline . untokenize <$> manyTill anyTok (controlSeq ")")) + , ("[", mathDisplay . untokenize <$> manyTill anyTok (controlSeq "]")) + , ("ensuremath", mathInline . untokenize <$> braced) + , ("texorpdfstring", const <$> tok <*> tok) + -- old TeX commands + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) + , ("tt", code . stringify . toList <$> inlines) + , ("rm", inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) + , ("MakeUppercase", makeUppercase <$> tok) + , ("MakeTextUppercase", makeUppercase <$> tok) -- textcase + , ("uppercase", makeUppercase <$> tok) + , ("MakeLowercase", makeLowercase <$> tok) + , ("MakeTextLowercase", makeLowercase <$> tok) + , ("lowercase", makeLowercase <$> tok) + , ("thanks", skipopts >> note <$> grouped block) + , ("footnote", skipopts >> note <$> grouped block) + , ("passthrough", tok) -- \passthrough macro used by latex writer + -- for listings + , ("includegraphics", do options <- option [] keyvals + src <- braced + mkImage options . + unescapeURL . + removeDoubleQuotes $ untokenize src) + -- hyperref + , ("url", (\url -> link url "" (str url)) . unescapeURL . untokenize <$> + bracedUrl) + , ("nolinkurl", code . unescapeURL . 
untokenize <$> bracedUrl) + , ("href", do url <- bracedUrl + sp + link (unescapeURL $ untokenize url) "" <$> tok) + , ("hyperlink", hyperlink) + , ("hyperref", hyperref) + , ("hypertarget", hypertargetInline) + -- hyphenat + , ("nohyphens", tok) + , ("textnhtt", ttfamily) + , ("nhttfamily", ttfamily) + -- LaTeX colors + , ("textcolor", coloredInline "color") + , ("colorbox", coloredInline "background-color") + -- etoolbox + , ("ifstrequal", ifstrequal) + , ("newtoggle", braced >>= newToggle) + , ("toggletrue", braced >>= setToggle True) + , ("togglefalse", braced >>= setToggle False) + , ("iftoggle", try $ ifToggle >> inline) + -- include + , ("input", rawInlineOr "input" $ include "input") + -- soul package + , ("ul", underline <$> tok) + -- ulem package + , ("uline", underline <$> tok) + -- plain tex stuff that should just be passed through as raw tex + , ("ifdim", ifdim) + -- stackengine + , ("addstackgap", skipopts *> tok) + ] lettrine :: PandocMonad m => LP m Inlines lettrine = do - optional opt + optional rawopt x <- tok y <- tok return $ extractSpaces (spanWith ("",["lettrine"],[])) x <> smallcaps y @@ -979,32 +446,18 @@ alterStr :: (Text -> Text) -> Inline -> Inline alterStr f (Str xs) = Str (f xs) alterStr _ x = x -foreignlanguage :: PandocMonad m => LP m Inlines -foreignlanguage = do - babelLang <- untokenize <$> braced - case babelLangToBCP47 babelLang of - Just lang -> spanWith ("", [], [("lang", renderLang lang)]) <$> tok - _ -> tok - -inlineLanguageCommands :: PandocMonad m => M.Map Text (LP m Inlines) -inlineLanguageCommands = M.fromList $ mk <$> M.toList polyglossiaLangToBCP47 - where - mk (polyglossia, bcp47Func) = - ("text" <> polyglossia, inlineLanguage bcp47Func) - -inlineLanguage :: PandocMonad m => (Text -> Lang) -> LP m Inlines -inlineLanguage bcp47Func = do - o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') - <$> rawopt - let lang = renderLang $ bcp47Func o - extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok - hyperlink :: PandocMonad m => LP m Inlines hyperlink = try $ do src <- untokenize <$> braced lab <- tok return $ link ("#" <> src) "" lab +hyperref :: PandocMonad m => LP m Inlines +hyperref = try $ do + url <- (("#" <>) . untokenize <$> try (sp *> bracketedToks <* sp)) + <|> untokenize <$> (bracedUrl <* bracedUrl <* bracedUrl) + link url "" <$> tok + hypertargetBlock :: PandocMonad m => LP m Blocks hypertargetBlock = try $ do ref <- untokenize <$> braced @@ -1019,31 +472,6 @@ hypertargetInline = try $ do ils <- grouped inline return $ spanWith (ref, [], []) ils -romanNumeralUpper :: (PandocMonad m) => LP m Inlines -romanNumeralUpper = - str . toRomanNumeral <$> romanNumeralArg - -romanNumeralLower :: (PandocMonad m) => LP m Inlines -romanNumeralLower = - str . T.toLower . 
toRomanNumeral <$> romanNumeralArg - -romanNumeralArg :: (PandocMonad m) => LP m Int -romanNumeralArg = spaces *> (parser <|> inBraces) - where - inBraces = do - symbol '{' - spaces - res <- parser - spaces - symbol '}' - return res - parser = do - Tok _ Word s <- satisfyTok isWordTok - let (digits, rest) = T.span isDigit s - unless (T.null rest) $ - Prelude.fail "Non-digits in argument to \\Rn or \\RN" - safeRead digits - newToggle :: (Monoid a, PandocMonad m) => [Tok] -> LP m a newToggle name = do updateState $ \st -> @@ -1074,9 +502,6 @@ ifToggle = do report $ UndefinedToggle name' pos return () -doTerm :: PandocMonad m => Translations.Term -> LP m Inlines -doTerm term = str <$> translateTerm term - ifstrequal :: (PandocMonad m, Monoid a) => LP m a ifstrequal = do str1 <- tok @@ -1097,13 +522,6 @@ coloredInline stylename = do ttfamily :: PandocMonad m => LP m Inlines ttfamily = code . stringify . toList <$> tok -rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines -rawInlineOr name' fallback = do - parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions - if parseRaw - then rawInline "latex" <$> getRawCommand name' ("\\" <> name') - else fallback - processHBox :: Inlines -> Inlines processHBox = walk convert where @@ -1154,79 +572,90 @@ treatAsInline = Set.fromList , "pagebreak" ] -label :: PandocMonad m => LP m () -label = do - controlSeq "label" - t <- braced - updateState $ \st -> st{ sLastLabel = Just $ untokenize t } - -dolabel :: PandocMonad m => LP m Inlines -dolabel = do - v <- braced - let refstr = untokenize v - updateState $ \st -> - st{ sLastLabel = Just refstr } - return $ spanWith (refstr,[],[("label", refstr)]) - $ inBrackets $ str $ untokenize v - -doref :: PandocMonad m => Text -> LP m Inlines -doref cls = do - v <- braced - let refstr = untokenize v - return $ linkWith ("",[],[ ("reference-type", cls) - , ("reference", refstr)]) - ("#" <> refstr) - "" - (inBrackets $ str refstr) - lookupListDefault :: (Ord k) => v -> [k] -> M.Map k v -> v lookupListDefault d = (fromMaybe d .) . lookupList where lookupList l m = msum $ map (`M.lookup` m) l inline :: PandocMonad m => LP m Inlines -inline = (mempty <$ comment) - <|> (space <$ whitespace) - <|> (softbreak <$ endline) - <|> word - <|> macroDef (rawInline "latex") - <|> inlineCommand' - <|> inlineEnvironment - <|> inlineGroup - <|> (symbol '-' *> - option (str "-") (symbol '-' *> - option (str "–") (str "—" <$ symbol '-'))) - <|> doubleQuote - <|> singleQuote - <|> (str "”" <$ try (symbol '\'' >> symbol '\'')) - <|> (str "”" <$ symbol '”') - <|> (str "’" <$ symbol '\'') - <|> (str "’" <$ symbol '’') - <|> (str "\160" <$ symbol '~') - <|> dollarsMath - <|> (guardEnabled Ext_literate_haskell *> symbol '|' *> doLHSverb) - <|> (str . T.singleton <$> primEscape) - <|> regularSymbol - <|> (do res <- symbolIn "#^'`\"[]&" - pos <- getPosition - let s = untoken res - report $ ParsingUnescaped s pos - return $ str s) +inline = do + Tok pos toktype t <- lookAhead anyTok + let symbolAsString = str . untoken <$> anySymbol + let unescapedSymbolAsString = + do s <- untoken <$> anySymbol + report $ ParsingUnescaped s pos + return $ str s + case toktype of + Comment -> mempty <$ comment + Spaces -> space <$ whitespace + Newline -> softbreak <$ endline + Word -> word + Esc1 -> str . T.singleton <$> primEscape + Esc2 -> str . 
T.singleton <$> primEscape + Symbol -> + case t of + "-" -> symbol '-' *> + option (str "-") (symbol '-' *> + option (str "–") (str "—" <$ symbol '-')) + "'" -> symbol '\'' *> + option (str "’") (str "”" <$ symbol '\'') + "~" -> str "\160" <$ symbol '~' + "`" -> doubleQuote <|> singleQuote <|> symbolAsString + "\"" -> doubleQuote <|> singleQuote <|> symbolAsString + "“" -> doubleQuote <|> symbolAsString + "‘" -> singleQuote <|> symbolAsString + "$" -> dollarsMath <|> unescapedSymbolAsString + "|" -> (guardEnabled Ext_literate_haskell *> + symbol '|' *> doLHSverb) <|> symbolAsString + "{" -> inlineGroup + "#" -> unescapedSymbolAsString + "&" -> unescapedSymbolAsString + "_" -> unescapedSymbolAsString + "^" -> unescapedSymbolAsString + "\\" -> mzero + "}" -> mzero + _ -> symbolAsString + CtrlSeq _ -> macroDef (rawInline "latex") + <|> inlineCommand' + <|> inlineEnvironment + <|> inlineGroup + _ -> mzero inlines :: PandocMonad m => LP m Inlines inlines = mconcat <$> many inline +opt :: PandocMonad m => LP m Inlines +opt = do + toks <- try (sp *> bracketedToks <* sp) + -- now parse the toks as inlines + st <- getState + parsed <- runParserT (mconcat <$> many inline) st "bracketed option" toks + case parsed of + Right result -> return result + Left e -> throwError $ PandocParsecError (toSources toks) e + -- block elements: preamble :: PandocMonad m => LP m Blocks preamble = mconcat <$> many preambleBlock where preambleBlock = (mempty <$ spaces1) <|> macroDef (rawBlock "latex") + <|> filecontents <|> (mempty <$ blockCommand) <|> (mempty <$ braced) <|> (do notFollowedBy (begin_ "document") anyTok return mempty) +rule :: PandocMonad m => LP m Blocks +rule = do + skipopts + width <- T.takeWhile (\c -> isDigit c || c == '.') . stringify <$> tok + _thickness <- tok + -- 0-width rules are used to fix spacing issues: + case safeRead width of + Just (0 :: Double) -> return mempty + _ -> return horizontalRule + paragraph :: PandocMonad m => LP m Blocks paragraph = do x <- trimInlines . mconcat <$> many1 inline @@ -1264,6 +693,16 @@ include name = do mapM_ (insertIncluded defaultExt) fs return mempty +readFileFromTexinputs :: PandocMonad m => FilePath -> LP m (Maybe Text) +readFileFromTexinputs fp = do + fileContentsMap <- sFileContents <$> getState + case M.lookup (T.pack fp) fileContentsMap of + Just t -> return (Just t) + Nothing -> do + dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." + <$> lookupEnv "TEXINPUTS" + readFileFromDirs dirs fp + insertIncluded :: PandocMonad m => FilePath -> FilePath @@ -1273,13 +712,12 @@ insertIncluded defaultExtension f' = do ".tex" -> f' ".sty" -> f' _ -> addExtension f' defaultExtension - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" pos <- getPosition containers <- getIncludeFiles <$> getState when (T.pack f `elem` containers) $ throwError $ PandocParseError $ T.pack $ "Include file loop at " ++ show pos updateState $ addIncludeFile $ T.pack f - mbcontents <- readFileFromDirs dirs f + mbcontents <- readFileFromTexinputs f contents <- case mbcontents of Just s -> return s Nothing -> do @@ -1288,10 +726,6 @@ insertIncluded defaultExtension f' = do getInput >>= setInput . 
(tokenize f contents ++) updateState dropLatestIncludeFile -addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m () -addMeta field val = updateState $ \st -> - st{ sMeta = addMetaField field val $ sMeta st } - authors :: PandocMonad m => LP m () authors = try $ do bgroup @@ -1300,150 +734,6 @@ authors = try $ do egroup addMeta "author" (map trimInlines auths) -macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a -macroDef constructor = do - (_, s) <- withRaw (commandDef <|> environmentDef) - (constructor (untokenize s) <$ - guardDisabled Ext_latex_macros) - <|> return mempty - where commandDef = do - (name, macro') <- newcommand <|> letmacro <|> defmacro - guardDisabled Ext_latex_macros <|> - updateState (\s -> s{ sMacros = M.insert name macro' (sMacros s) }) - environmentDef = do - mbenv <- newenvironment - case mbenv of - Nothing -> return () - Just (name, macro1, macro2) -> - guardDisabled Ext_latex_macros <|> - do updateState $ \s -> s{ sMacros = - M.insert name macro1 (sMacros s) } - updateState $ \s -> s{ sMacros = - M.insert ("end" <> name) macro2 (sMacros s) } - -- @\newenvironment{envname}[n-args][default]{begin}{end}@ - -- is equivalent to - -- @\newcommand{\envname}[n-args][default]{begin}@ - -- @\newcommand{\endenvname}@ - -letmacro :: PandocMonad m => LP m (Text, Macro) -letmacro = do - controlSeq "let" - (name, contents) <- withVerbatimMode $ do - Tok _ (CtrlSeq name) _ <- anyControlSeq - optional $ symbol '=' - spaces - -- we first parse in verbatim mode, and then expand macros, - -- because we don't want \let\foo\bar to turn into - -- \let\foo hello if we have previously \def\bar{hello} - contents <- bracedOrToken - return (name, contents) - contents' <- doMacros' 0 contents - return (name, Macro ExpandWhenDefined [] Nothing contents') - -defmacro :: PandocMonad m => LP m (Text, Macro) -defmacro = try $ - -- we use withVerbatimMode, because macros are to be expanded - -- at point of use, not point of definition - withVerbatimMode $ do - controlSeq "def" - Tok _ (CtrlSeq name) _ <- anyControlSeq - argspecs <- many (argspecArg <|> argspecPattern) - contents <- bracedOrToken - return (name, Macro ExpandWhenUsed argspecs Nothing contents) - -argspecArg :: PandocMonad m => LP m ArgSpec -argspecArg = do - Tok _ (Arg i) _ <- satisfyTok isArgTok - return $ ArgNum i - -argspecPattern :: PandocMonad m => LP m ArgSpec -argspecPattern = - Pattern <$> many1 (satisfyTok (\(Tok _ toktype' txt) -> - (toktype' == Symbol || toktype' == Word) && - (txt /= "{" && txt /= "\\" && txt /= "}"))) - -newcommand :: PandocMonad m => LP m (Text, Macro) -newcommand = do - pos <- getPosition - Tok _ (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> - controlSeq "renewcommand" <|> - controlSeq "providecommand" <|> - controlSeq "DeclareMathOperator" <|> - controlSeq "DeclareRobustCommand" - withVerbatimMode $ do - Tok _ (CtrlSeq name) txt <- do - optional (symbol '*') - anyControlSeq <|> - (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') - spaces - numargs <- option 0 $ try bracketedNum - let argspecs = map ArgNum [1..numargs] - spaces - optarg <- option Nothing $ Just <$> try bracketedToks - spaces - contents' <- bracedOrToken - let contents = - case mtype of - "DeclareMathOperator" -> - Tok pos (CtrlSeq "mathop") "\\mathop" - : Tok pos Symbol "{" - : Tok pos (CtrlSeq "mathrm") "\\mathrm" - : Tok pos Symbol "{" - : (contents' ++ - [ Tok pos Symbol "}", Tok pos Symbol "}" ]) - _ -> contents' - macros <- sMacros <$> getState - case M.lookup name macros of - Just macro - | 
mtype == "newcommand" -> do - report $ MacroAlreadyDefined txt pos - return (name, macro) - | mtype == "providecommand" -> return (name, macro) - _ -> return (name, Macro ExpandWhenUsed argspecs optarg contents) - -newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, Macro)) -newenvironment = do - pos <- getPosition - Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> - controlSeq "renewenvironment" <|> - controlSeq "provideenvironment" - withVerbatimMode $ do - optional $ symbol '*' - spaces - name <- untokenize <$> braced - spaces - numargs <- option 0 $ try bracketedNum - spaces - optarg <- option Nothing $ Just <$> try bracketedToks - let argspecs = map (\i -> ArgNum i) [1..numargs] - startcontents <- spaces >> bracedOrToken - endcontents <- spaces >> bracedOrToken - macros <- sMacros <$> getState - case M.lookup name macros of - Just _ - | mtype == "newenvironment" -> do - report $ MacroAlreadyDefined name pos - return Nothing - | mtype == "provideenvironment" -> - return Nothing - _ -> return $ Just (name, - Macro ExpandWhenUsed argspecs optarg startcontents, - Macro ExpandWhenUsed [] Nothing endcontents) - -bracketedNum :: PandocMonad m => LP m Int -bracketedNum = do - ds <- untokenize <$> bracketedToks - case safeRead ds of - Just i -> return i - _ -> return 0 - -setCaption :: PandocMonad m => LP m () -setCaption = try $ do - skipopts - ils <- tok - optional $ try $ spaces *> label - updateState $ \st -> st{ sCaption = Just ils } - looseItem :: PandocMonad m => LP m Blocks looseItem = do inListItem <- sInListItem <$> getState @@ -1457,10 +747,6 @@ epigraph = do p2 <- grouped block return $ divWith ("", ["epigraph"], []) (p1 <> p2) -resetCaption :: PandocMonad m => LP m () -resetCaption = updateState $ \st -> st{ sCaption = Nothing - , sLastLabel = Nothing } - section :: PandocMonad m => Attr -> Int -> LP m Blocks section (ident, classes, kvs) lvl = do skipopts @@ -1554,7 +840,7 @@ blockCommands = M.fromList , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) , ("signature", mempty <$ (skipopts *> authors)) , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) - , ("newtheorem", newtheorem) + , ("newtheorem", newtheorem inline) , ("theoremstyle", theoremstyle) -- KOMA-Script metadata commands , ("extratitle", mempty <$ (skipopts *> tok >>= addMeta "extratitle")) @@ -1598,11 +884,11 @@ blockCommands = M.fromList -- , ("hrule", pure horizontalRule) , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok $> horizontalRule) + , ("rule", rule) , ("item", looseItem) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", para . trimInlines <$> (skipopts *> tok)) - , ("caption", mempty <$ setCaption) + , ("caption", mempty <$ setCaption inline) , ("bibliography", mempty <$ (skipopts *> braced >>= addMeta "bibliography" . splitBibs . 
untokenize)) , ("addbibresource", mempty <$ (skipopts *> braced >>= @@ -1640,7 +926,8 @@ blockCommands = M.fromList environments :: PandocMonad m => M.Map Text (LP m Blocks) -environments = M.fromList +environments = M.union (tableEnvironments blocks inline) $ + M.fromList [ ("document", env "document" blocks <* skipMany anyTok) , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) , ("sloppypar", env "sloppypar" blocks) @@ -1654,13 +941,6 @@ environments = M.fromList , ("flushright", divWith ("", ["flushright"], []) <$> env "flushright" blocks) , ("flushleft", divWith ("", ["flushleft"], []) <$> env "flushleft" blocks) , ("landscape", env "landscape" blocks) - , ("longtable", env "longtable" $ - resetCaption *> simpTable "longtable" False >>= addTableCaption) - , ("table", env "table" $ - skipopts *> resetCaption *> blocks >>= addTableCaption) - , ("tabular*", env "tabular*" $ simpTable "tabular*" True) - , ("tabularx", env "tabularx" $ simpTable "tabularx" True) - , ("tabular", env "tabular" $ simpTable "tabular" False) , ("quote", blockQuote <$> env "quote" blocks) , ("quotation", blockQuote <$> env "quotation" blocks) , ("verse", blockQuote <$> env "verse" blocks) @@ -1683,7 +963,7 @@ environments = M.fromList , ("lilypond", rawVerbEnv "lilypond") , ("ly", rawVerbEnv "ly") -- amsthm - , ("proof", proof) + , ("proof", proof blocks opt) -- etoolbox , ("ifstrequal", ifstrequal) , ("newtoggle", braced >>= newToggle) @@ -1692,130 +972,29 @@ environments = M.fromList , ("iftoggle", try $ ifToggle >> block) ] -theoremstyle :: PandocMonad m => LP m Blocks -theoremstyle = do - stylename <- untokenize <$> braced - let mbstyle = case stylename of - "plain" -> Just PlainStyle - "definition" -> Just DefinitionStyle - "remark" -> Just RemarkStyle - _ -> Nothing - case mbstyle of - Nothing -> return () - Just sty -> updateState $ \s -> s{ sLastTheoremStyle = sty } - return mempty - -newtheorem :: PandocMonad m => LP m Blocks -newtheorem = do - number <- option True (False <$ symbol '*' <* sp) +filecontents :: PandocMonad m => LP m Blocks +filecontents = try $ do + controlSeq "begin" name <- untokenize <$> braced - sp - series <- option Nothing $ Just . untokenize <$> bracketedToks - sp - showName <- tok - sp - syncTo <- option Nothing $ Just . 
untokenize <$> bracketedToks - sty <- sLastTheoremStyle <$> getState - let spec = TheoremSpec { theoremName = showName - , theoremStyle = sty - , theoremSeries = series - , theoremSyncTo = syncTo - , theoremNumber = number - , theoremLastNum = DottedNum [0] } - tmap <- sTheoremMap <$> getState - updateState $ \s -> s{ sTheoremMap = - M.insert name spec tmap } + guard $ name == "filecontents" || name == "filecontents*" + skipopts + fp <- untokenize <$> braced + txt <- verbEnv name + updateState $ \st -> + st{ sFileContents = M.insert fp txt (sFileContents st) } return mempty -proof :: PandocMonad m => LP m Blocks -proof = do - title <- option (B.text "Proof") opt - bs <- env "proof" blocks - return $ - B.divWith ("", ["proof"], []) $ - addQed $ addTitle (B.emph (title <> ".")) bs - -addTitle :: Inlines -> Blocks -> Blocks -addTitle ils bs = - case B.toList bs of - (Para xs : rest) - -> B.fromList (Para (B.toList ils ++ (Space : xs)) : rest) - _ -> B.para ils <> bs - -addQed :: Blocks -> Blocks -addQed bs = - case Seq.viewr (B.unMany bs) of - s Seq.:> Para ils - -> B.Many (s Seq.|> Para (ils ++ B.toList qedSign)) - _ -> bs <> B.para qedSign - where - qedSign = B.str "\xa0\x25FB" - environment :: PandocMonad m => LP m Blocks environment = try $ do controlSeq "begin" name <- untokenize <$> braced M.findWithDefault mzero name environments <|> - theoremEnvironment name <|> + theoremEnvironment blocks opt name <|> if M.member name (inlineEnvironments :: M.Map Text (LP PandocPure Inlines)) then mzero else try (rawEnv name) <|> rawVerbEnv name -theoremEnvironment :: PandocMonad m => Text -> LP m Blocks -theoremEnvironment name = do - tmap <- sTheoremMap <$> getState - case M.lookup name tmap of - Nothing -> mzero - Just tspec -> do - optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt - mblabel <- option Nothing $ Just . untokenize <$> - try (spaces >> controlSeq "label" >> spaces >> braced) - bs <- env name blocks - number <- - if theoremNumber tspec - then do - let name' = fromMaybe name $ theoremSeries tspec - num <- getNextNumber - (maybe (DottedNum [0]) theoremLastNum . - M.lookup name' . sTheoremMap) - updateState $ \s -> - s{ sTheoremMap = - M.adjust - (\spec -> spec{ theoremLastNum = num }) - name' - (sTheoremMap s) - } - - case mblabel of - Just ident -> - updateState $ \s -> - s{ sLabels = M.insert ident - (B.toList $ - theoremName tspec <> "\160" <> - str (renderDottedNum num)) (sLabels s) } - Nothing -> return () - return $ space <> B.text (renderDottedNum num) - else return mempty - let titleEmph = case theoremStyle tspec of - PlainStyle -> B.strong - DefinitionStyle -> B.strong - RemarkStyle -> B.emph - let title = titleEmph (theoremName tspec <> number) - <> optTitle <> "." 
<> space - return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title - $ case theoremStyle tspec of - PlainStyle -> walk italicize bs - _ -> bs - -italicize :: Block -> Block -italicize (Para ils) = Para [Emph ils] -italicize (Plain ils) = Plain [Emph ils] -italicize x = x - -env :: PandocMonad m => Text -> LP m a -> LP m a -env name p = p <* end_ name - rawEnv :: PandocMonad m => Text -> LP m Blocks rawEnv name = do exts <- getOption readerExtensions @@ -1823,15 +1002,17 @@ rawEnv name = do rawOptions <- mconcat <$> many rawopt let beginCommand = "\\begin{" <> name <> "}" <> rawOptions pos1 <- getPosition - (bs, raw) <- withRaw $ env name blocks if parseRaw - then return $ rawBlock "latex" + then do + (_, raw) <- withRaw $ env name blocks + return $ rawBlock "latex" $ beginCommand <> untokenize raw else do + bs <- env name blocks report $ SkippedContent beginCommand pos1 pos2 <- getPosition report $ SkippedContent ("\\end{" <> name <> "}") pos2 - return bs + return $ divWith ("",[name],[]) bs rawVerbEnv :: PandocMonad m => Text -> LP m Blocks rawVerbEnv name = do @@ -1890,8 +1071,7 @@ inputMinted = do pos <- getPosition attr <- mintedAttr f <- T.filter (/='"') . untokenize <$> braced - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" - mbCode <- readFileFromDirs dirs (T.unpack f) + mbCode <- readFileFromTexinputs (T.unpack f) rawcode <- case mbCode of Just s -> return s Nothing -> do @@ -1989,8 +1169,7 @@ inputListing = do pos <- getPosition options <- option [] keyvals f <- T.filter (/='"') . untokenize <$> braced - dirs <- map T.unpack . splitTextBy (==':') . fromMaybe "." <$> lookupEnv "TEXINPUTS" - mbCode <- readFileFromDirs dirs (T.unpack f) + mbCode <- readFileFromTexinputs (T.unpack f) codeLines <- case mbCode of Just s -> return $ T.lines s Nothing -> do @@ -1999,7 +1178,8 @@ inputListing = do let (ident,classes,kvs) = parseListingsOptions options let classes' = (case listingsLanguage options of - Nothing -> (take 1 (languagesByExtension (T.pack $ takeExtension $ T.unpack f)) <>) + Nothing -> (take 1 (languagesByExtension defaultSyntaxMap + (T.pack $ takeExtension $ T.unpack f)) <>) Just _ -> id) classes let firstline = fromMaybe 1 $ lookup "firstline" options >>= safeRead let lastline = fromMaybe (length codeLines) $ @@ -2065,358 +1245,23 @@ orderedList' = try $ do bs <- listenv "enumerate" (many item) return $ orderedListWith (start, style, delim) bs --- tables - -hline :: PandocMonad m => LP m () -hline = try $ do - spaces - controlSeq "hline" <|> - -- booktabs rules: - controlSeq "toprule" <|> - controlSeq "bottomrule" <|> - controlSeq "midrule" <|> - controlSeq "endhead" <|> - controlSeq "endfirsthead" - spaces - optional opt - return () - -lbreak :: PandocMonad m => LP m Tok -lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") - <* skipopts <* spaces - -amp :: PandocMonad m => LP m Tok -amp = symbol '&' - --- Split a Word into individual Symbols (for parseAligns) -splitWordTok :: PandocMonad m => LP m () -splitWordTok = do - inp <- getInput - case inp of - (Tok spos Word t : rest) -> - setInput $ map (Tok spos Symbol . 
T.singleton) (T.unpack t) <> rest - _ -> return () - -parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))] -parseAligns = try $ do - let maybeBar = skipMany - (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced))) - let cAlign = AlignCenter <$ symbol 'c' - let lAlign = AlignLeft <$ symbol 'l' - let rAlign = AlignRight <$ symbol 'r' - let parAlign = AlignLeft <$ symbol 'p' - -- aligns from tabularx - let xAlign = AlignLeft <$ symbol 'X' - let mAlign = AlignLeft <$ symbol 'm' - let bAlign = AlignLeft <$ symbol 'b' - let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign - <|> xAlign <|> mAlign <|> bAlign ) - let alignPrefix = symbol '>' >> braced - let alignSuffix = symbol '<' >> braced - let colWidth = try $ do - symbol '{' - ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth") - spaces - symbol '}' - return $ safeRead ds - let alignSpec = do - pref <- option [] alignPrefix - spaces - al <- alignChar - width <- colWidth <|> option Nothing (do s <- untokenize <$> braced - pos <- getPosition - report $ SkippedContent s pos - return Nothing) - spaces - suff <- option [] alignSuffix - return (al, width, (pref, suff)) - let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro - symbol '*' - spaces - ds <- trim . untokenize <$> braced - spaces - spec <- braced - case safeRead ds of - Just n -> - getInput >>= setInput . (mconcat (replicate n spec) ++) - Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number" - bgroup - spaces - maybeBar - aligns' <- many $ try $ spaces >> optional starAlign >> - (alignSpec <* maybeBar) - spaces - egroup - spaces - return $ map toSpec aligns' - where - toColWidth (Just w) | w > 0 = ColWidth w - toColWidth _ = ColWidthDefault - toSpec (x, y, z) = (x, toColWidth y, z) - --- N.B. this parser returns a Row that may have erroneous empty cells --- in it. See the note above fixTableHead for details. -parseTableRow :: PandocMonad m - => Text -- ^ table environment name - -> [([Tok], [Tok])] -- ^ pref/suffixes - -> LP m Row -parseTableRow envname prefsufs = do - notFollowedBy (spaces *> end_ envname) - -- add prefixes and suffixes in token stream: - let celltoks (pref, suff) = do - prefpos <- getPosition - contents <- mconcat <$> - many ( snd <$> withRaw (controlSeq "parbox" >> parbox) -- #5711 - <|> - snd <$> withRaw (inlineEnvironment <|> dollarsMath) - <|> - (do notFollowedBy - (() <$ amp <|> () <$ lbreak <|> end_ envname) - count 1 anyTok) ) - - suffpos <- getPosition - option [] (count 1 amp) - return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff - rawcells <- mapM celltoks prefsufs - oldInput <- getInput - cells <- mapM (\ts -> setInput ts >> parseTableCell) rawcells - setInput oldInput - spaces - return $ Row nullAttr cells - -parseTableCell :: PandocMonad m => LP m Cell -parseTableCell = do - spaces - updateState $ \st -> st{ sInTableCell = True } - cell' <- multicolumnCell - <|> multirowCell - <|> parseSimpleCell - <|> parseEmptyCell - updateState $ \st -> st{ sInTableCell = False } - spaces - return cell' - where - -- The parsing of empty cells is important in LaTeX, especially when dealing - -- with multirow/multicolumn. See #6603. 
- parseEmptyCell = spaces $> emptyCell - -cellAlignment :: PandocMonad m => LP m Alignment -cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|') - where - alignment = do - c <- untoken <$> singleChar - return $ case c of - "l" -> AlignLeft - "r" -> AlignRight - "c" -> AlignCenter - "*" -> AlignDefault - _ -> AlignDefault - -plainify :: Blocks -> Blocks -plainify bs = case toList bs of - [Para ils] -> plain (fromList ils) - _ -> bs - -multirowCell :: PandocMonad m => LP m Cell -multirowCell = controlSeq "multirow" >> do - -- Full prototype for \multirow macro is: - -- \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text} - -- However, everything except `nrows` and `text` make - -- sense in the context of the Pandoc AST - _ <- optional $ symbol '[' *> cellAlignment <* symbol ']' -- vertical position - nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced - _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- bigstrut-related - _ <- symbol '{' *> manyTill anyTok (symbol '}') -- Cell width - _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- Length used for fine-tuning - content <- symbol '{' *> (plainify <$> blocks) <* symbol '}' - return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content - -multicolumnCell :: PandocMonad m => LP m Cell -multicolumnCell = controlSeq "multicolumn" >> do - span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced - alignment <- symbol '{' *> cellAlignment <* symbol '}' - - let singleCell = do - content <- plainify <$> blocks - return $ cell alignment (RowSpan 1) (ColSpan span') content - - -- Two possible contents: either a \multirow cell, or content. - -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}} - -- Note that a \multirow cell can be nested in a \multicolumn, - -- but not the other way around. See #6603 - let nestedCell = do - (Cell _ _ (RowSpan rs) _ bs) <- multirowCell - return $ cell - alignment - (RowSpan rs) - (ColSpan span') - (fromList bs) - - symbol '{' *> (nestedCell <|> singleCell) <* symbol '}' - --- Parse a simple cell, i.e. not multirow/multicol -parseSimpleCell :: PandocMonad m => LP m Cell -parseSimpleCell = simpleCell <$> (plainify <$> blocks) - --- LaTeX tables are stored with empty cells underneath multirow cells --- denoting the grid spaces taken up by them. More specifically, if a --- cell spans m rows, then it will overwrite all the cells in the --- columns it spans for (m-1) rows underneath it, requiring padding --- cells in these places. These padding cells need to be removed for --- proper table reading. See #6603. --- --- These fixTable functions do not otherwise fix up malformed --- input tables: that is left to the table builder. -fixTableHead :: TableHead -> TableHead -fixTableHead (TableHead attr rows) = TableHead attr rows' - where - rows' = fixTableRows rows - -fixTableBody :: TableBody -> TableBody -fixTableBody (TableBody attr rhc th tb) - = TableBody attr rhc th' tb' - where - th' = fixTableRows th - tb' = fixTableRows tb - -fixTableRows :: [Row] -> [Row] -fixTableRows = fixTableRows' $ repeat Nothing - where - fixTableRows' oldHang (Row attr cells : rs) - = let (newHang, cells') = fixTableRow oldHang cells - rs' = fixTableRows' newHang rs - in Row attr cells' : rs' - fixTableRows' _ [] = [] - --- The overhang is represented as Just (relative cell dimensions) or --- Nothing for an empty grid space. 
-fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell]) -fixTableRow oldHang cells - -- If there's overhang, drop cells until their total width meets the - -- width of the occupied grid spaces (or we run out) - | (n, prefHang, restHang) <- splitHang oldHang - , n > 0 - = let cells' = dropToWidth getCellW n cells - (restHang', cells'') = fixTableRow restHang cells' - in (prefHang restHang', cells'') - -- Otherwise record the overhang of a pending cell and fix the rest - -- of the row - | c@(Cell _ _ h w _):cells' <- cells - = let h' = max 1 h - w' = max 1 w - oldHang' = dropToWidth getHangW w' oldHang - (newHang, cells'') = fixTableRow oldHang' cells' - in (toHang w' h' <> newHang, c : cells'') - | otherwise - = (oldHang, []) - where - getCellW (Cell _ _ _ w _) = w - getHangW = maybe 1 fst - getCS (ColSpan n) = n - - toHang c r - | r > 1 = [Just (c, r)] - | otherwise = replicate (getCS c) Nothing - - -- Take the prefix of the overhang list representing filled grid - -- spaces. Also return the remainder and the length of this prefix. - splitHang = splitHang' 0 id - - splitHang' !n l (Just (c, r):xs) - = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs - splitHang' n l xs = (n, l, xs) - - -- Drop list items until the total width of the dropped items - -- exceeds the passed width. - dropToWidth _ n l | n < 1 = l - dropToWidth wproj n (c:cs) = dropToWidth wproj (n - wproj c) cs - dropToWidth _ _ [] = [] - -simpTable :: PandocMonad m => Text -> Bool -> LP m Blocks -simpTable envname hasWidthParameter = try $ do - when hasWidthParameter $ () <$ (spaces >> tok) - skipopts - colspecs <- parseAligns - let (aligns, widths, prefsufs) = unzip3 colspecs - optional $ controlSeq "caption" *> setCaption - spaces - optional label - spaces - optional lbreak - spaces - skipMany hline - spaces - header' <- option [] . try . fmap (:[]) $ - parseTableRow envname prefsufs <* lbreak <* many1 hline - spaces - rows <- sepEndBy (parseTableRow envname prefsufs) - (lbreak <* optional (skipMany hline)) - spaces - optional $ controlSeq "caption" *> setCaption - spaces - optional label - spaces - optional lbreak - spaces - lookAhead $ controlSeq "end" -- make sure we're at end - let th = fixTableHead $ TableHead nullAttr header' - let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows] - let tf = TableFoot nullAttr [] - return $ table emptyCaption (zip aligns widths) th tbs tf - -addTableCaption :: PandocMonad m => Blocks -> LP m Blocks -addTableCaption = walkM go - where go (Table attr c spec th tb tf) = do - st <- getState - let mblabel = sLastLabel st - capt <- case (sCaption st, mblabel) of - (Just ils, Nothing) -> return $ caption Nothing (plain ils) - (Just ils, Just lab) -> do - num <- getNextNumber sLastTableNum - setState - st{ sLastTableNum = num - , sLabels = M.insert lab - [Str (renderDottedNum num)] - (sLabels st) } - return $ caption Nothing (plain ils) -- add number?? - (Nothing, _) -> return c - let attr' = case (attr, mblabel) of - ((_,classes,kvs), Just ident) -> - (ident,classes,kvs) - _ -> attr - return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf - go x = return x - --- TODO: For now we add a Div to contain table attributes, since --- most writers don't do anything yet with attributes on Table. --- This can be removed when that changes. 
-addAttrDiv :: Attr -> Block -> Block -addAttrDiv ("",[],[]) b = b -addAttrDiv attr b = Div attr [b] - block :: PandocMonad m => LP m Blocks block = do - res <- (mempty <$ spaces1) - <|> environment - <|> macroDef (rawBlock "latex") - <|> blockCommand - <|> paragraph - <|> grouped block + Tok _ toktype _ <- lookAhead anyTok + res <- (case toktype of + Newline -> mempty <$ spaces1 + Spaces -> mempty <$ spaces1 + Comment -> mempty <$ spaces1 + Word -> paragraph + CtrlSeq "begin" -> environment + CtrlSeq _ -> macroDef (rawBlock "latex") + <|> blockCommand + _ -> mzero) + <|> paragraph + <|> grouped block trace (T.take 60 $ tshow $ B.toList res) return res blocks :: PandocMonad m => LP m Blocks blocks = mconcat <$> many block -setDefaultLanguage :: PandocMonad m => LP m Blocks -setDefaultLanguage = do - o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') - <$> rawopt - polylang <- untokenize <$> braced - case M.lookup polylang polyglossiaLangToBCP47 of - Nothing -> return mempty -- TODO mzero? warning? - Just langFunc -> do - let l = langFunc o - setTranslations l - updateState $ setMeta "lang" $ str (renderLang l) - return mempty diff --git a/src/Text/Pandoc/Readers/LaTeX/Citation.hs b/src/Text/Pandoc/Readers/LaTeX/Citation.hs new file mode 100644 index 000000000..af97125c6 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Citation.hs @@ -0,0 +1,210 @@ +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Citation + ( citationCommands + , cites + ) +where + +import Text.Pandoc.Class +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Builder as B +import qualified Data.Map as M +import Data.Text (Text) +import Control.Applicative ((<|>), optional, many) +import Control.Monad (mzero) +import Control.Monad.Trans (lift) +import Control.Monad.Except (throwError) +import Text.Pandoc.Error (PandocError(PandocParsecError)) +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) + +citationCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines) +citationCommands inline = + let citation = citationWith inline + tok = spaces *> grouped inline + in M.fromList + [ ("cite", citation "cite" NormalCitation False) + , ("Cite", citation "Cite" NormalCitation False) + , ("citep", citation "citep" NormalCitation False) + , ("citep*", citation "citep*" NormalCitation False) + , ("citeal", citation "citeal" NormalCitation False) + , ("citealp", citation "citealp" NormalCitation False) + , ("citealp*", citation "citealp*" NormalCitation False) + , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) + , ("footcite", inNote <$> citation "footcite" NormalCitation False) + , ("parencite", citation "parencite" NormalCitation False) + , ("supercite", citation "supercite" NormalCitation False) + , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) + , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) + , ("citeyear", citation "citeyear" SuppressAuthor False) + , ("autocite*", citation "autocite*" SuppressAuthor False) + , ("cite*", citation "cite*" SuppressAuthor False) + , ("parencite*", citation "parencite*" SuppressAuthor False) + , ("textcite", citation "textcite" AuthorInText False) + , ("citet", citation "citet" AuthorInText False) + , ("citet*", citation "citet*" AuthorInText False) + , ("citealt", citation "citealt" AuthorInText False) + , ("citealt*", citation "citealt*" 
AuthorInText False) + , ("textcites", citation "textcites" AuthorInText True) + , ("cites", citation "cites" NormalCitation True) + , ("autocites", citation "autocites" NormalCitation True) + , ("footcites", inNote <$> citation "footcites" NormalCitation True) + , ("parencites", citation "parencites" NormalCitation True) + , ("supercites", citation "supercites" NormalCitation True) + , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) + , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) + , ("Footcite", inNote <$> citation "Footcite" NormalCitation False) + , ("Parencite", citation "Parencite" NormalCitation False) + , ("Supercite", citation "Supercite" NormalCitation False) + , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) + , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) + , ("Citeyear", citation "Citeyear" SuppressAuthor False) + , ("Autocite*", citation "Autocite*" SuppressAuthor False) + , ("Cite*", citation "Cite*" SuppressAuthor False) + , ("Parencite*", citation "Parencite*" SuppressAuthor False) + , ("Textcite", citation "Textcite" AuthorInText False) + , ("Textcites", citation "Textcites" AuthorInText True) + , ("Cites", citation "Cites" NormalCitation True) + , ("Autocites", citation "Autocites" NormalCitation True) + , ("Footcites", inNote <$> citation "Footcites" NormalCitation True) + , ("Parencites", citation "Parencites" NormalCitation True) + , ("Supercites", citation "Supercites" NormalCitation True) + , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) + , ("citetext", complexNatbibCitation inline NormalCitation) + , ("citeauthor", (try (tok *> sp *> controlSeq "citetext") *> + complexNatbibCitation inline AuthorInText) + <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) + ] + +-- citations + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +simpleCiteArgs :: forall m . 
PandocMonad m => LP m Inlines -> LP m [Citation] +simpleCiteArgs inline = try $ do + first <- optionMaybe $ toList <$> opt + second <- optionMaybe $ toList <$> opt + keys <- try $ bgroup *> manyTill citationLabel egroup + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + where + opt :: PandocMonad m => LP m Inlines + opt = do + toks <- try (sp *> bracketedToks <* sp) + -- now parse the toks as inlines + st <- getState + parsed <- lift $ + runParserT (mconcat <$> many inline) st "bracketed option" toks + case parsed of + Right result -> return result + Left e -> throwError $ PandocParsecError (toSources toks) e + + + +citationLabel :: PandocMonad m => LP m Text +citationLabel = do + sp + untokenize <$> + (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) + <* sp + <* optional (symbol ',') + <* sp) + where bibtexKeyChar = ".:;?!`'()/*@_+=-&[]" :: [Char] + +cites :: PandocMonad m + => LP m Inlines -> CitationMode -> Bool -> LP m [Citation] +cites inline mode multi = try $ do + let paropt = parenWrapped inline + cits <- if multi + then do + multiprenote <- optionMaybe $ toList <$> paropt + multipostnote <- optionMaybe $ toList <$> paropt + let (pre, suf) = case (multiprenote, multipostnote) of + (Just s , Nothing) -> (mempty, s) + (Nothing , Just t) -> (mempty, t) + (Just s , Just t ) -> (s, t) + _ -> (mempty, mempty) + tempCits <- many1 $ simpleCiteArgs inline + case tempCits of + (k:ks) -> case ks of + (_:_) -> return $ (addMprenote pre k : init ks) ++ + [addMpostnote suf (last ks)] + _ -> return [addMprenote pre (addMpostnote suf k)] + _ -> return [[]] + else count 1 $ simpleCiteArgs inline + let cs = concat cits + return $ case mode of + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs + where mprenote (k:ks) = (k:ks) ++ [Space] + mprenote _ = mempty + mpostnote (k:ks) = [Str ",", Space] ++ (k:ks) + mpostnote _ = mempty + addMprenote mpn (k:ks) = + let mpnfinal = case citationPrefix k of + (_:_) -> mprenote mpn + _ -> mpn + in addPrefix mpnfinal (k:ks) + addMprenote _ _ = [] + addMpostnote = addSuffix . mpostnote + +citationWith :: PandocMonad m + => LP m Inlines -> Text -> CitationMode -> Bool -> LP m Inlines +citationWith inline name mode multi = do + (c,raw) <- withRaw $ cites inline mode multi + return $ cite c (rawInline "latex" $ "\\" <> name <> untokenize raw) + +handleCitationPart :: Inlines -> [Citation] +handleCitationPart ils = + let isCite Cite{} = True + isCite _ = False + (pref, rest) = break isCite (toList ils) + in case rest of + (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs + _ -> [] + +complexNatbibCitation :: PandocMonad m + => LP m Inlines -> CitationMode -> LP m Inlines +complexNatbibCitation inline mode = try $ do + (cs, raw) <- + withRaw $ concat <$> do + bgroup + items <- mconcat <$> + many1 (notFollowedBy (symbol ';') >> inline) + `sepBy1` symbol ';' + egroup + return $ map handleCitationPart items + case cs of + [] -> mzero + (c:cits) -> return $ cite (c{ citationMode = mode }:cits) + (rawInline "latex" $ "\\citetext" <> untokenize raw) + +inNote :: Inlines -> Inlines +inNote ils = + note $ para $ ils <> str "." 
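
[Editorial note, not part of the patch] The new Text.Pandoc.Readers.LaTeX.Citation module takes the inline parser as an explicit argument (citationCommands inline, cites inline, ...) instead of importing it, presumably so the split-out reader modules avoid circular imports; the resulting table is merged back into inlineCommands with M.unions, as shown earlier in this diff. As a minimal illustrative sketch, this is roughly the Citation value the new simpleCiteArgs/cites/citationWith pipeline yields for \parencite[see][p. 3]{knuth}. The field names come from Text.Pandoc.Definition; the exact tokenization of the prefix/suffix inlines is assumed for illustration only.

    -- Sketch only: approximate AST for \parencite[see][p. 3]{knuth}.
    -- With two bracketed arguments, simpleCiteArgs uses the first as the
    -- citation prefix and the second as the suffix; a single bracketed
    -- argument becomes the suffix.
    import Text.Pandoc.Definition

    exampleParencite :: Inline
    exampleParencite =
      Cite [ Citation
               { citationId      = "knuth"
               , citationPrefix  = [Str "see"]                -- first [..]
               , citationSuffix  = [Str "p.", Space, Str "3"] -- assumed tokenization
               , citationMode    = NormalCitation             -- \parencite maps to NormalCitation
               , citationHash    = 0
               , citationNoteNum = 0
               } ]
           [ RawInline (Format "latex") "\\parencite[see][p. 3]{knuth}" ]

The raw LaTeX is kept alongside the parsed citations (citationWith wraps the result in cite ... (rawInline "latex" ...)), so writers that do not handle Cite can still round-trip the original command.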
+ diff --git a/src/Text/Pandoc/Readers/LaTeX/Inline.hs b/src/Text/Pandoc/Readers/LaTeX/Inline.hs new file mode 100644 index 000000000..7b8bca4af --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Inline.hs @@ -0,0 +1,397 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.LaTeX.Inline + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Readers.LaTeX.Inline + ( acronymCommands + , verbCommands + , charCommands + , accentCommands + , nameCommands + , biblatexInlineCommands + , refCommands + , rawInlineOr + , listingsLanguage + ) +where + +import qualified Data.Map as M +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Builder +import Text.Pandoc.Shared (toRomanNumeral, safeRead) +import Text.Pandoc.Readers.LaTeX.Types (Tok (..), TokType (..)) +import Control.Applicative (optional, (<|>)) +import Control.Monad (guard, mzero, mplus, unless) +import Text.Pandoc.Class.PandocMonad (PandocMonad (..), translateTerm) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Extensions (extensionEnabled, Extension(..)) +import Text.Pandoc.Parsing (getOption, updateState, getState, notFollowedBy, + manyTill, getInput, setInput, incSourceColumn, + option, many1, try) +import Data.Char (isDigit) +import Text.Pandoc.Highlighting (fromListingsLanguage,) +import Data.Maybe (maybeToList, fromMaybe) +import Text.Pandoc.Options (ReaderOptions(..)) +import qualified Data.Text.Normalize as Normalize +import qualified Text.Pandoc.Translations as Translations + +rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines +rawInlineOr name' fallback = do + parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions + if parseRaw + then rawInline "latex" <$> getRawCommand name' ("\\" <> name') + else fallback + +dolabel :: PandocMonad m => LP m Inlines +dolabel = do + v <- braced + let refstr = untokenize v + updateState $ \st -> + st{ sLastLabel = Just refstr } + return $ spanWith (refstr,[],[("label", refstr)]) + $ inBrackets $ str $ untokenize v + +doref :: PandocMonad m => Text -> LP m Inlines +doref cls = do + v <- braced + let refstr = untokenize v + return $ linkWith ("",[],[ ("reference-type", cls) + , ("reference", refstr)]) + ("#" <> refstr) + "" + (inBrackets $ str refstr) + +inBrackets :: Inlines -> Inlines +inBrackets x = str "[" <> x <> str "]" + +doTerm :: PandocMonad m => Translations.Term -> LP m Inlines +doTerm term = str <$> translateTerm term + +lit :: Text -> LP m Inlines +lit = pure . str + +doverb :: PandocMonad m => LP m Inlines +doverb = do + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + withVerbatimMode $ + code . 
untokenize <$> + manyTill (notFollowedBy newlineTok >> verbTok marker) (symbol marker) + +verbTok :: PandocMonad m => Char -> LP m Tok +verbTok stopchar = do + t@(Tok pos toktype txt) <- anyTok + case T.findIndex (== stopchar) txt of + Nothing -> return t + Just i -> do + let (t1, t2) = T.splitAt i txt + inp <- getInput + setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) + : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp + return $ Tok pos toktype t1 + +listingsLanguage :: [(Text, Text)] -> Maybe Text +listingsLanguage opts = + case lookup "language" opts of + Nothing -> Nothing + Just l -> fromListingsLanguage l `mplus` Just l + +dolstinline :: PandocMonad m => LP m Inlines +dolstinline = do + options <- option [] keyvals + let classes = maybeToList $ listingsLanguage options + doinlinecode classes + +domintinline :: PandocMonad m => LP m Inlines +domintinline = do + skipopts + cls <- untokenize <$> braced + doinlinecode [cls] + +doinlinecode :: PandocMonad m => [Text] -> LP m Inlines +doinlinecode classes = do + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + let stopchar = if marker == '{' then '}' else marker + withVerbatimMode $ + codeWith ("",classes,[]) . T.map nlToSpace . untokenize <$> + manyTill (verbTok stopchar) (symbol stopchar) + +nlToSpace :: Char -> Char +nlToSpace '\n' = ' ' +nlToSpace x = x + +romanNumeralUpper :: (PandocMonad m) => LP m Inlines +romanNumeralUpper = + str . toRomanNumeral <$> romanNumeralArg + +romanNumeralLower :: (PandocMonad m) => LP m Inlines +romanNumeralLower = + str . T.toLower . toRomanNumeral <$> romanNumeralArg + +romanNumeralArg :: (PandocMonad m) => LP m Int +romanNumeralArg = spaces *> (parser <|> inBraces) + where + inBraces = do + symbol '{' + spaces + res <- parser + spaces + symbol '}' + return res + parser = do + s <- untokenize <$> many1 (satisfyTok isWordTok) + let (digits, rest) = T.span isDigit s + unless (T.null rest) $ + Prelude.fail "Non-digits in argument to \\Rn or \\RN" + safeRead digits + +accentWith :: PandocMonad m + => LP m Inlines -> Char -> Maybe Char -> LP m Inlines +accentWith tok combiningAccent fallBack = try $ do + ils <- tok + case toList ils of + (Str (T.uncons -> Just (x, xs)) : ys) -> return $ fromList $ + -- try to normalize to the combined character: + Str (Normalize.normalize Normalize.NFC + (T.pack [x, combiningAccent]) <> xs) : ys + [Space] -> return $ str $ T.singleton + $ fromMaybe combiningAccent fallBack + [] -> return $ str $ T.singleton + $ fromMaybe combiningAccent fallBack + _ -> return ils + + +verbCommands :: PandocMonad m => M.Map Text (LP m Inlines) +verbCommands = M.fromList + [ ("verb", doverb) + , ("lstinline", dolstinline) + , ("mintinline", domintinline) + , ("Verb", doverb) + ] + +accentCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines) +accentCommands tok = + let accent = accentWith tok + in M.fromList + [ ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("textasciicircum", lit "^") + , ("textasciitilde", lit "~") + , ("H", accent '\779' Nothing) -- hungarumlaut + , ("`", accent '\768' (Just '`')) -- grave + , ("'", accent '\769' (Just '\'')) -- acute + , ("^", accent '\770' (Just '^')) -- circ + , ("~", accent '\771' (Just '~')) -- tilde + 
, ("\"", accent '\776' Nothing) -- umlaut + , (".", accent '\775' Nothing) -- dot + , ("=", accent '\772' Nothing) -- macron + , ("|", accent '\781' Nothing) -- vertical line above + , ("b", accent '\817' Nothing) -- macron below + , ("c", accent '\807' Nothing) -- cedilla + , ("G", accent '\783' Nothing) -- doublegrave + , ("h", accent '\777' Nothing) -- hookabove + , ("d", accent '\803' Nothing) -- dotbelow + , ("f", accent '\785' Nothing) -- inverted breve + , ("r", accent '\778' Nothing) -- ringabove + , ("t", accent '\865' Nothing) -- double inverted breve + , ("U", accent '\782' Nothing) -- double vertical line above + , ("v", accent '\780' Nothing) -- hacek + , ("u", accent '\774' Nothing) -- breve + , ("k", accent '\808' Nothing) -- ogonek + , ("textogonekcentered", accent '\808' Nothing) -- ogonek + , ("i", lit "ı") -- dotless i + , ("j", lit "ȷ") -- dotless j + , ("newtie", accent '\785' Nothing) -- inverted breve + , ("textcircled", accent '\8413' Nothing) -- combining circle + ] + +charCommands :: PandocMonad m => M.Map Text (LP m Inlines) +charCommands = M.fromList + [ ("ldots", lit "…") + , ("vdots", lit "\8942") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("sep", lit ",") + , ("P", lit "¶") + , ("S", lit "§") + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + , ("qed", lit "\a0\x25FB") + , ("lq", return (str "‘")) + , ("rq", return (str "’")) + , ("textquoteleft", return (str "‘")) + , ("textquoteright", return (str "’")) + , ("textquotedblleft", return (str "“")) + , ("textquotedblright", return (str "”")) + , ("/", pure mempty) -- italic correction + , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState + guard $ not inTableCell + optional rawopt + spaces)) + , (",", lit "\8198") + , ("@", pure mempty) + , (" ", lit "\160") + , ("ps", pure $ str "PS." <> space) + , ("TeX", lit "TeX") + , ("LaTeX", lit "LaTeX") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("slash", lit "/") + -- fontawesome + , ("faCheck", lit "\10003") + , ("faClose", lit "\10007") + -- hyphenat + , ("bshyp", lit "\\\173") + , ("fshyp", lit "/\173") + , ("dothyp", lit ".\173") + , ("colonhyp", lit ":\173") + , ("hyp", lit "-") + ] + +biblatexInlineCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +biblatexInlineCommands tok = M.fromList + -- biblatex misc + [ ("RN", romanNumeralUpper) + , ("Rn", romanNumeralLower) + , ("mkbibquote", spanWith nullAttr . doubleQuoted <$> tok) + , ("mkbibemph", spanWith nullAttr . emph <$> tok) + , ("mkbibitalic", spanWith nullAttr . emph <$> tok) + , ("mkbibbold", spanWith nullAttr . strong <$> tok) + , ("mkbibparens", + spanWith nullAttr . (\x -> str "(" <> x <> str ")") <$> tok) + , ("mkbibbrackets", + spanWith nullAttr . (\x -> str "[" <> x <> str "]") <$> tok) + , ("autocap", spanWith nullAttr <$> tok) + , ("textnormal", spanWith ("",["nodecor"],[]) <$> tok) + , ("bibstring", + (\x -> spanWith ("",[],[("bibstring",x)]) (str x)) . untokenize + <$> braced) + , ("adddot", pure (str ".")) + , ("adddotspace", pure (spanWith nullAttr (str "." 
<> space))) + , ("addabbrvspace", pure space) + , ("hyphen", pure (str "-")) + ] + +nameCommands :: PandocMonad m => M.Map Text (LP m Inlines) +nameCommands = M.fromList + [ ("figurename", doTerm Translations.Figure) + , ("prefacename", doTerm Translations.Preface) + , ("refname", doTerm Translations.References) + , ("bibname", doTerm Translations.Bibliography) + , ("chaptername", doTerm Translations.Chapter) + , ("partname", doTerm Translations.Part) + , ("contentsname", doTerm Translations.Contents) + , ("listfigurename", doTerm Translations.ListOfFigures) + , ("listtablename", doTerm Translations.ListOfTables) + , ("indexname", doTerm Translations.Index) + , ("abstractname", doTerm Translations.Abstract) + , ("tablename", doTerm Translations.Table) + , ("enclname", doTerm Translations.Encl) + , ("ccname", doTerm Translations.Cc) + , ("headtoname", doTerm Translations.To) + , ("pagename", doTerm Translations.Page) + , ("seename", doTerm Translations.See) + , ("seealsoname", doTerm Translations.SeeAlso) + , ("proofname", doTerm Translations.Proof) + , ("glossaryname", doTerm Translations.Glossary) + , ("lstlistingname", doTerm Translations.Listing) + ] + +refCommands :: PandocMonad m => M.Map Text (LP m Inlines) +refCommands = M.fromList + [ ("label", rawInlineOr "label" dolabel) + , ("ref", rawInlineOr "ref" $ doref "ref") + , ("cref", rawInlineOr "cref" $ doref "ref") -- from cleveref.sty + , ("vref", rawInlineOr "vref" $ doref "ref+page") -- from varioref.sty + , ("eqref", rawInlineOr "eqref" $ doref "eqref") -- from amsmath.sty + ] + +acronymCommands :: PandocMonad m => M.Map Text (LP m Inlines) +acronymCommands = M.fromList + -- glossaries package + [ ("gls", doAcronym "short") + , ("Gls", doAcronym "short") + , ("glsdesc", doAcronym "long") + , ("Glsdesc", doAcronym "long") + , ("GLSdesc", doAcronym "long") + , ("acrlong", doAcronym "long") + , ("Acrlong", doAcronym "long") + , ("acrfull", doAcronym "full") + , ("Acrfull", doAcronym "full") + , ("acrshort", doAcronym "abbrv") + , ("Acrshort", doAcronym "abbrv") + , ("glspl", doAcronymPlural "short") + , ("Glspl", doAcronymPlural "short") + , ("glsdescplural", doAcronymPlural "long") + , ("Glsdescplural", doAcronymPlural "long") + , ("GLSdescplural", doAcronymPlural "long") + -- acronyms package + , ("ac", doAcronym "short") + , ("acf", doAcronym "full") + , ("acs", doAcronym "abbrv") + , ("acl", doAcronym "long") + , ("acp", doAcronymPlural "short") + , ("acfp", doAcronymPlural "full") + , ("acsp", doAcronymPlural "abbrv") + , ("aclp", doAcronymPlural "long") + , ("Ac", doAcronym "short") + , ("Acf", doAcronym "full") + , ("Acs", doAcronym "abbrv") + , ("Acl", doAcronym "long") + , ("Acp", doAcronymPlural "short") + , ("Acfp", doAcronymPlural "full") + , ("Acsp", doAcronymPlural "abbrv") + , ("Aclp", doAcronymPlural "long") + ] + +doAcronym :: PandocMonad m => Text -> LP m Inlines +doAcronym form = do + acro <- braced + return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), + ("acronym-form", "singular+" <> form)]) + $ str $ untokenize acro] + +doAcronymPlural :: PandocMonad m => Text -> LP m Inlines +doAcronymPlural form = do + acro <- braced + let plural = str "s" + return . 
mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro), + ("acronym-form", "plural+" <> form)]) $ + mconcat [str $ untokenize acro, plural]] + + diff --git a/src/Text/Pandoc/Readers/LaTeX/Lang.hs b/src/Text/Pandoc/Readers/LaTeX/Lang.hs index 814b2fe79..6a8327904 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Lang.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Lang.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.LaTeX.Lang - Copyright : Copyright (C) 2018-2020 John MacFarlane + Copyright : Copyright (C) 2018-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -12,144 +12,223 @@ Functions for parsing polyglossia and babel language specifiers to BCP47 'Lang'. -} module Text.Pandoc.Readers.LaTeX.Lang - ( polyglossiaLangToBCP47 + ( setDefaultLanguage + , polyglossiaLangToBCP47 , babelLangToBCP47 + , enquoteCommands + , inlineLanguageCommands ) where import qualified Data.Map as M +import Data.Text (Text) import qualified Data.Text as T -import Text.Pandoc.BCP47 (Lang(..)) +import Text.Pandoc.Shared (extractSpaces) +import Text.Collate.Lang (Lang(..), renderLang) +import Text.Pandoc.Class (PandocMonad(..), setTranslations) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Parsing (updateState, option, getState, QuoteContext(..), + withQuoteContext) +import Text.Pandoc.Builder (Blocks, Inlines, setMeta, str, spanWith, + singleQuoted, doubleQuoted) + +enquote :: PandocMonad m + => LP m Inlines + -> Bool -> Maybe Text -> LP m Inlines +enquote tok starred mblang = do + skipopts + let lang = mblang >>= babelLangToBCP47 + let langspan = case lang of + Nothing -> id + Just l -> spanWith ("",[],[("lang", renderLang l)]) + quoteContext <- sQuoteContext <$> getState + if starred || quoteContext == InDoubleQuote + then singleQuoted . langspan <$> withQuoteContext InSingleQuote tok + else doubleQuoted . langspan <$> withQuoteContext InDoubleQuote tok + +enquoteCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +enquoteCommands tok = M.fromList + [ ("enquote*", enquote tok True Nothing) + , ("enquote", enquote tok False Nothing) + -- foreignquote is supposed to use native quote marks + , ("foreignquote*", braced >>= enquote tok True . Just . untokenize) + , ("foreignquote", braced >>= enquote tok False . Just . untokenize) + -- hypehnquote uses regular quotes + , ("hyphenquote*", braced >>= enquote tok True . Just . untokenize) + , ("hyphenquote", braced >>= enquote tok False . Just . 
untokenize) + ] + +foreignlanguage :: PandocMonad m => LP m Inlines -> LP m Inlines +foreignlanguage tok = do + babelLang <- untokenize <$> braced + case babelLangToBCP47 babelLang of + Just lang -> spanWith ("", [], [("lang", renderLang lang)]) <$> tok + _ -> tok + +inlineLanguageCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +inlineLanguageCommands tok = + M.fromList $ + ("foreignlanguage", foreignlanguage tok) : + (mk <$> M.toList polyglossiaLangToBCP47) + where + mk (polyglossia, bcp47Func) = + ("text" <> polyglossia, inlineLanguage tok bcp47Func) + +inlineLanguage :: PandocMonad m + => LP m Inlines -> (Text -> Lang) -> LP m Inlines +inlineLanguage tok bcp47Func = do + o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') + <$> rawopt + let lang = renderLang $ bcp47Func o + extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok + +setDefaultLanguage :: PandocMonad m => LP m Blocks +setDefaultLanguage = do + o <- option "" $ T.filter (\c -> c /= '[' && c /= ']') + <$> rawopt + polylang <- untokenize <$> braced + case M.lookup polylang polyglossiaLangToBCP47 of + Nothing -> return mempty -- TODO mzero? warning? + Just langFunc -> do + let l = langFunc o + setTranslations l + updateState $ setMeta "lang" $ str (renderLang l) + return mempty polyglossiaLangToBCP47 :: M.Map T.Text (T.Text -> Lang) polyglossiaLangToBCP47 = M.fromList [ ("arabic", \o -> case T.filter (/=' ') o of - "locale=algeria" -> Lang "ar" "" "DZ" [] - "locale=mashriq" -> Lang "ar" "" "SY" [] - "locale=libya" -> Lang "ar" "" "LY" [] - "locale=morocco" -> Lang "ar" "" "MA" [] - "locale=mauritania" -> Lang "ar" "" "MR" [] - "locale=tunisia" -> Lang "ar" "" "TN" [] - _ -> Lang "ar" "" "" []) + "locale=algeria" -> Lang "ar" Nothing (Just "DZ") [] [] [] + "locale=mashriq" -> Lang "ar" Nothing (Just "SY") [] [] [] + "locale=libya" -> Lang "ar" Nothing (Just "LY") [] [] [] + "locale=morocco" -> Lang "ar" Nothing (Just "MA") [] [] [] + "locale=mauritania" -> Lang "ar" Nothing (Just "MR") [] [] [] + "locale=tunisia" -> Lang "ar" Nothing (Just "TN") [] [] [] + _ -> Lang "ar" Nothing (Just "") [] [] []) , ("german", \o -> case T.filter (/=' ') o of - "spelling=old" -> Lang "de" "" "DE" ["1901"] + "spelling=old" -> Lang "de" Nothing (Just "DE") ["1901"] [] [] "variant=austrian,spelling=old" - -> Lang "de" "" "AT" ["1901"] - "variant=austrian" -> Lang "de" "" "AT" [] + -> Lang "de" Nothing (Just "AT") ["1901"] [] [] + "variant=austrian" -> Lang "de" Nothing (Just "AT") [] [] [] "variant=swiss,spelling=old" - -> Lang "de" "" "CH" ["1901"] - "variant=swiss" -> Lang "de" "" "CH" [] - _ -> Lang "de" "" "" []) - , ("lsorbian", \_ -> Lang "dsb" "" "" []) + -> Lang "de" Nothing (Just "CH") ["1901"] [] [] + "variant=swiss" -> Lang "de" Nothing (Just "CH") [] [] [] + _ -> Lang "de" Nothing Nothing [] [] []) + , ("lsorbian", \_ -> Lang "dsb" Nothing Nothing [] [] []) , ("greek", \o -> case T.filter (/=' ') o of - "variant=poly" -> Lang "el" "" "polyton" [] - "variant=ancient" -> Lang "grc" "" "" [] - _ -> Lang "el" "" "" []) + "variant=poly" -> Lang "el" Nothing (Just "polyton") [] [] [] + "variant=ancient" -> Lang "grc" Nothing Nothing [] [] [] + _ -> Lang "el" Nothing Nothing [] [] []) , ("english", \o -> case T.filter (/=' ') o of - "variant=australian" -> Lang "en" "" "AU" [] - "variant=canadian" -> Lang "en" "" "CA" [] - "variant=british" -> Lang "en" "" "GB" [] - "variant=newzealand" -> Lang "en" "" "NZ" [] - "variant=american" -> Lang "en" "" "US" [] - _ -> Lang "en" "" "" []) - , ("usorbian", \_ -> Lang 
"hsb" "" "" []) + "variant=australian" -> Lang "en" Nothing (Just "AU") [] [] [] + "variant=canadian" -> Lang "en" Nothing (Just "CA") [] [] [] + "variant=british" -> Lang "en" Nothing (Just "GB") [] [] [] + "variant=newzealand" -> Lang "en" Nothing (Just "NZ") [] [] [] + "variant=american" -> Lang "en" Nothing (Just "US") [] [] [] + _ -> Lang "en" Nothing (Just "") [] [] []) + , ("usorbian", \_ -> Lang "hsb" Nothing Nothing [] [] []) , ("latin", \o -> case T.filter (/=' ') o of - "variant=classic" -> Lang "la" "" "" ["x-classic"] - _ -> Lang "la" "" "" []) - , ("slovenian", \_ -> Lang "sl" "" "" []) - , ("serbianc", \_ -> Lang "sr" "cyrl" "" []) - , ("pinyin", \_ -> Lang "zh" "Latn" "" ["pinyin"]) - , ("afrikaans", \_ -> Lang "af" "" "" []) - , ("amharic", \_ -> Lang "am" "" "" []) - , ("assamese", \_ -> Lang "as" "" "" []) - , ("asturian", \_ -> Lang "ast" "" "" []) - , ("bulgarian", \_ -> Lang "bg" "" "" []) - , ("bengali", \_ -> Lang "bn" "" "" []) - , ("tibetan", \_ -> Lang "bo" "" "" []) - , ("breton", \_ -> Lang "br" "" "" []) - , ("catalan", \_ -> Lang "ca" "" "" []) - , ("welsh", \_ -> Lang "cy" "" "" []) - , ("czech", \_ -> Lang "cs" "" "" []) - , ("coptic", \_ -> Lang "cop" "" "" []) - , ("danish", \_ -> Lang "da" "" "" []) - , ("divehi", \_ -> Lang "dv" "" "" []) - , ("esperanto", \_ -> Lang "eo" "" "" []) - , ("spanish", \_ -> Lang "es" "" "" []) - , ("estonian", \_ -> Lang "et" "" "" []) - , ("basque", \_ -> Lang "eu" "" "" []) - , ("farsi", \_ -> Lang "fa" "" "" []) - , ("finnish", \_ -> Lang "fi" "" "" []) - , ("french", \_ -> Lang "fr" "" "" []) - , ("friulan", \_ -> Lang "fur" "" "" []) - , ("irish", \_ -> Lang "ga" "" "" []) - , ("scottish", \_ -> Lang "gd" "" "" []) - , ("ethiopic", \_ -> Lang "gez" "" "" []) - , ("galician", \_ -> Lang "gl" "" "" []) - , ("hebrew", \_ -> Lang "he" "" "" []) - , ("hindi", \_ -> Lang "hi" "" "" []) - , ("croatian", \_ -> Lang "hr" "" "" []) - , ("magyar", \_ -> Lang "hu" "" "" []) - , ("armenian", \_ -> Lang "hy" "" "" []) - , ("interlingua", \_ -> Lang "ia" "" "" []) - , ("indonesian", \_ -> Lang "id" "" "" []) - , ("icelandic", \_ -> Lang "is" "" "" []) - , ("italian", \_ -> Lang "it" "" "" []) - , ("japanese", \_ -> Lang "jp" "" "" []) - , ("khmer", \_ -> Lang "km" "" "" []) - , ("kurmanji", \_ -> Lang "kmr" "" "" []) - , ("kannada", \_ -> Lang "kn" "" "" []) - , ("korean", \_ -> Lang "ko" "" "" []) - , ("lao", \_ -> Lang "lo" "" "" []) - , ("lithuanian", \_ -> Lang "lt" "" "" []) - , ("latvian", \_ -> Lang "lv" "" "" []) - , ("malayalam", \_ -> Lang "ml" "" "" []) - , ("mongolian", \_ -> Lang "mn" "" "" []) - , ("marathi", \_ -> Lang "mr" "" "" []) - , ("dutch", \_ -> Lang "nl" "" "" []) - , ("nynorsk", \_ -> Lang "nn" "" "" []) - , ("norsk", \_ -> Lang "no" "" "" []) - , ("nko", \_ -> Lang "nqo" "" "" []) - , ("occitan", \_ -> Lang "oc" "" "" []) - , ("panjabi", \_ -> Lang "pa" "" "" []) - , ("polish", \_ -> Lang "pl" "" "" []) - , ("piedmontese", \_ -> Lang "pms" "" "" []) - , ("portuguese", \_ -> Lang "pt" "" "" []) - , ("romansh", \_ -> Lang "rm" "" "" []) - , ("romanian", \_ -> Lang "ro" "" "" []) - , ("russian", \_ -> Lang "ru" "" "" []) - , ("sanskrit", \_ -> Lang "sa" "" "" []) - , ("samin", \_ -> Lang "se" "" "" []) - , ("slovak", \_ -> Lang "sk" "" "" []) - , ("albanian", \_ -> Lang "sq" "" "" []) - , ("serbian", \_ -> Lang "sr" "" "" []) - , ("swedish", \_ -> Lang "sv" "" "" []) - , ("syriac", \_ -> Lang "syr" "" "" []) - , ("tamil", \_ -> Lang "ta" "" "" []) - , ("telugu", \_ -> Lang "te" "" "" []) - , ("thai", \_ -> Lang 
"th" "" "" []) - , ("turkmen", \_ -> Lang "tk" "" "" []) - , ("turkish", \_ -> Lang "tr" "" "" []) - , ("ukrainian", \_ -> Lang "uk" "" "" []) - , ("urdu", \_ -> Lang "ur" "" "" []) - , ("vietnamese", \_ -> Lang "vi" "" "" []) + "variant=classic" -> Lang "la" Nothing Nothing ["x-classic"] [] [] + _ -> Lang "la" Nothing Nothing [] [] []) + , ("slovenian", \_ -> Lang "sl" Nothing Nothing [] [] []) + , ("serbianc", \_ -> Lang "sr" (Just "Cyrl") Nothing [] [] []) + , ("pinyin", \_ -> Lang "zh" (Just "Latn") Nothing ["pinyin"] [] []) + , ("afrikaans", \_ -> simpleLang "af") + , ("amharic", \_ -> simpleLang "am") + , ("assamese", \_ -> simpleLang "as") + , ("asturian", \_ -> simpleLang "ast") + , ("bulgarian", \_ -> simpleLang "bg") + , ("bengali", \_ -> simpleLang "bn") + , ("tibetan", \_ -> simpleLang "bo") + , ("breton", \_ -> simpleLang "br") + , ("catalan", \_ -> simpleLang "ca") + , ("welsh", \_ -> simpleLang "cy") + , ("czech", \_ -> simpleLang "cs") + , ("coptic", \_ -> simpleLang "cop") + , ("danish", \_ -> simpleLang "da") + , ("divehi", \_ -> simpleLang "dv") + , ("esperanto", \_ -> simpleLang "eo") + , ("spanish", \_ -> simpleLang "es") + , ("estonian", \_ -> simpleLang "et") + , ("basque", \_ -> simpleLang "eu") + , ("farsi", \_ -> simpleLang "fa") + , ("finnish", \_ -> simpleLang "fi") + , ("french", \_ -> simpleLang "fr") + , ("friulan", \_ -> simpleLang "fur") + , ("irish", \_ -> simpleLang "ga") + , ("scottish", \_ -> simpleLang "gd") + , ("ethiopic", \_ -> simpleLang "gez") + , ("galician", \_ -> simpleLang "gl") + , ("hebrew", \_ -> simpleLang "he") + , ("hindi", \_ -> simpleLang "hi") + , ("croatian", \_ -> simpleLang "hr") + , ("magyar", \_ -> simpleLang "hu") + , ("armenian", \_ -> simpleLang "hy") + , ("interlingua", \_ -> simpleLang "ia") + , ("indonesian", \_ -> simpleLang "id") + , ("icelandic", \_ -> simpleLang "is") + , ("italian", \_ -> simpleLang "it") + , ("japanese", \_ -> simpleLang "jp") + , ("khmer", \_ -> simpleLang "km") + , ("kurmanji", \_ -> simpleLang "kmr") + , ("kannada", \_ -> simpleLang "kn") + , ("korean", \_ -> simpleLang "ko") + , ("lao", \_ -> simpleLang "lo") + , ("lithuanian", \_ -> simpleLang "lt") + , ("latvian", \_ -> simpleLang "lv") + , ("malayalam", \_ -> simpleLang "ml") + , ("mongolian", \_ -> simpleLang "mn") + , ("marathi", \_ -> simpleLang "mr") + , ("dutch", \_ -> simpleLang "nl") + , ("nynorsk", \_ -> simpleLang "nn") + , ("norsk", \_ -> simpleLang "no") + , ("nko", \_ -> simpleLang "nqo") + , ("occitan", \_ -> simpleLang "oc") + , ("panjabi", \_ -> simpleLang "pa") + , ("polish", \_ -> simpleLang "pl") + , ("piedmontese", \_ -> simpleLang "pms") + , ("portuguese", \_ -> simpleLang "pt") + , ("romansh", \_ -> simpleLang "rm") + , ("romanian", \_ -> simpleLang "ro") + , ("russian", \_ -> simpleLang "ru") + , ("sanskrit", \_ -> simpleLang "sa") + , ("samin", \_ -> simpleLang "se") + , ("slovak", \_ -> simpleLang "sk") + , ("albanian", \_ -> simpleLang "sq") + , ("serbian", \_ -> simpleLang "sr") + , ("swedish", \_ -> simpleLang "sv") + , ("syriac", \_ -> simpleLang "syr") + , ("tamil", \_ -> simpleLang "ta") + , ("telugu", \_ -> simpleLang "te") + , ("thai", \_ -> simpleLang "th") + , ("turkmen", \_ -> simpleLang "tk") + , ("turkish", \_ -> simpleLang "tr") + , ("ukrainian", \_ -> simpleLang "uk") + , ("urdu", \_ -> simpleLang "ur") + , ("vietnamese", \_ -> simpleLang "vi") ] +simpleLang :: Text -> Lang +simpleLang l = Lang l Nothing Nothing [] [] [] + babelLangToBCP47 :: T.Text -> Maybe Lang babelLangToBCP47 s = case s of - "austrian" 
-> Just $ Lang "de" "" "AT" ["1901"] - "naustrian" -> Just $ Lang "de" "" "AT" [] - "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"] - "nswissgerman" -> Just $ Lang "de" "" "CH" [] - "german" -> Just $ Lang "de" "" "DE" ["1901"] - "ngerman" -> Just $ Lang "de" "" "DE" [] - "lowersorbian" -> Just $ Lang "dsb" "" "" [] - "uppersorbian" -> Just $ Lang "hsb" "" "" [] - "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"] - "slovene" -> Just $ Lang "sl" "" "" [] - "australian" -> Just $ Lang "en" "" "AU" [] - "canadian" -> Just $ Lang "en" "" "CA" [] - "british" -> Just $ Lang "en" "" "GB" [] - "newzealand" -> Just $ Lang "en" "" "NZ" [] - "american" -> Just $ Lang "en" "" "US" [] - "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"] + "austrian" -> Just $ Lang "de" Nothing (Just "AT") ["1901"] [] [] + "naustrian" -> Just $ Lang "de" Nothing (Just "AT") [] [] [] + "swissgerman" -> Just $ Lang "de" Nothing (Just "CH") ["1901"] [] [] + "nswissgerman" -> Just $ Lang "de" Nothing (Just "CH") [] [] [] + "german" -> Just $ Lang "de" Nothing (Just "DE") ["1901"] [] [] + "ngerman" -> Just $ Lang "de" Nothing (Just "DE") [] [] [] + "lowersorbian" -> Just $ Lang "dsb" Nothing Nothing [] [] [] + "uppersorbian" -> Just $ Lang "hsb" Nothing Nothing [] [] [] + "polutonikogreek" -> Just $ Lang "el" Nothing Nothing ["polyton"] [] [] + "slovene" -> Just $ simpleLang "sl" + "australian" -> Just $ Lang "en" Nothing (Just "AU") [] [] [] + "canadian" -> Just $ Lang "en" Nothing (Just "CA") [] [] [] + "british" -> Just $ Lang "en" Nothing (Just "GB") [] [] [] + "newzealand" -> Just $ Lang "en" Nothing (Just "NZ") [] [] [] + "american" -> Just $ Lang "en" Nothing (Just "US") [] [] [] + "classiclatin" -> Just $ Lang "la" Nothing Nothing ["x-classic"] [] [] _ -> ($ "") <$> M.lookup s polyglossiaLangToBCP47 diff --git a/src/Text/Pandoc/Readers/LaTeX/Macro.hs b/src/Text/Pandoc/Readers/LaTeX/Macro.hs new file mode 100644 index 000000000..5495a8e74 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Macro.hs @@ -0,0 +1,184 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Macro + ( macroDef + ) +where +import Text.Pandoc.Extensions (Extension(..)) +import Text.Pandoc.Logging (LogMessage(MacroAlreadyDefined)) +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Class +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) +import Control.Applicative ((<|>), optional) +import qualified Data.Map as M +import Data.Text (Text) +import qualified Data.Text as T + +macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a +macroDef constructor = do + (_, s) <- withRaw (commandDef <|> environmentDef) + (constructor (untokenize s) <$ + guardDisabled Ext_latex_macros) + <|> return mempty + where commandDef = do + nameMacroPairs <- newcommand <|> letmacro <|> defmacro <|> newif + guardDisabled Ext_latex_macros <|> + mapM_ (\(name, macro') -> + updateState (\s -> s{ sMacros = M.insert name macro' + (sMacros s) })) nameMacroPairs + environmentDef = do + mbenv <- newenvironment + case mbenv of + Nothing -> return () + Just (name, macro1, macro2) -> + guardDisabled Ext_latex_macros <|> + do updateState $ \s -> s{ sMacros = + M.insert name macro1 (sMacros s) } + updateState $ \s -> s{ sMacros = + M.insert ("end" <> name) macro2 (sMacros s) } + -- @\newenvironment{envname}[n-args][default]{begin}{end}@ + -- is equivalent to + -- 
@\newcommand{\envname}[n-args][default]{begin}@ + -- @\newcommand{\endenvname}@ + +letmacro :: PandocMonad m => LP m [(Text, Macro)] +letmacro = do + controlSeq "let" + (name, contents) <- withVerbatimMode $ do + Tok _ (CtrlSeq name) _ <- anyControlSeq + optional $ symbol '=' + spaces + -- we first parse in verbatim mode, and then expand macros, + -- because we don't want \let\foo\bar to turn into + -- \let\foo hello if we have previously \def\bar{hello} + contents <- bracedOrToken + return (name, contents) + contents' <- doMacros' 0 contents + return [(name, Macro ExpandWhenDefined [] Nothing contents')] + +defmacro :: PandocMonad m => LP m [(Text, Macro)] +defmacro = do + -- we use withVerbatimMode, because macros are to be expanded + -- at point of use, not point of definition + controlSeq "def" + withVerbatimMode $ do + Tok _ (CtrlSeq name) _ <- anyControlSeq + argspecs <- many (argspecArg <|> argspecPattern) + contents <- bracedOrToken + return [(name, Macro ExpandWhenUsed argspecs Nothing contents)] + +-- \newif\iffoo' defines: +-- \iffoo to be \iffalse +-- \footrue to be a command that defines \iffoo to be \iftrue +-- \foofalse to be a command that defines \iffoo to be \iffalse +newif :: PandocMonad m => LP m [(Text, Macro)] +newif = do + controlSeq "newif" + withVerbatimMode $ do + Tok pos (CtrlSeq name) _ <- anyControlSeq + -- \def\iffoo\iffalse + -- \def\footrue{\def\iffoo\iftrue} + -- \def\foofalse{\def\iffoo\iffalse} + let base = T.drop 2 name + return [ (name, Macro ExpandWhenUsed [] Nothing + [Tok pos (CtrlSeq "iffalse") "\\iffalse"]) + , (base <> "true", + Macro ExpandWhenUsed [] Nothing + [ Tok pos (CtrlSeq "def") "\\def" + , Tok pos (CtrlSeq name) ("\\" <> name) + , Tok pos (CtrlSeq "iftrue") "\\iftrue" + ]) + , (base <> "false", + Macro ExpandWhenUsed [] Nothing + [ Tok pos (CtrlSeq "def") "\\def" + , Tok pos (CtrlSeq name) ("\\" <> name) + , Tok pos (CtrlSeq "iffalse") "\\iffalse" + ]) + ] + +argspecArg :: PandocMonad m => LP m ArgSpec +argspecArg = do + Tok _ (Arg i) _ <- satisfyTok isArgTok + return $ ArgNum i + +argspecPattern :: PandocMonad m => LP m ArgSpec +argspecPattern = + Pattern <$> many1 (satisfyTok (\(Tok _ toktype' txt) -> + (toktype' == Symbol || toktype' == Word) && + (txt /= "{" && txt /= "\\" && txt /= "}"))) + +newcommand :: PandocMonad m => LP m [(Text, Macro)] +newcommand = do + Tok pos (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> + controlSeq "renewcommand" <|> + controlSeq "providecommand" <|> + controlSeq "DeclareMathOperator" <|> + controlSeq "DeclareRobustCommand" + withVerbatimMode $ do + Tok _ (CtrlSeq name) txt <- do + optional (symbol '*') + anyControlSeq <|> + (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') + spaces + numargs <- option 0 $ try bracketedNum + let argspecs = map ArgNum [1..numargs] + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + contents' <- bracedOrToken + let contents = + case mtype of + "DeclareMathOperator" -> + Tok pos (CtrlSeq "mathop") "\\mathop" + : Tok pos Symbol "{" + : Tok pos (CtrlSeq "mathrm") "\\mathrm" + : Tok pos Symbol "{" + : (contents' ++ + [ Tok pos Symbol "}", Tok pos Symbol "}" ]) + _ -> contents' + macros <- sMacros <$> getState + case M.lookup name macros of + Just macro + | mtype == "newcommand" -> do + report $ MacroAlreadyDefined txt pos + return [(name, macro)] + | mtype == "providecommand" -> return [(name, macro)] + _ -> return [(name, Macro ExpandWhenUsed argspecs optarg contents)] + +newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, 
Macro)) +newenvironment = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> + controlSeq "renewenvironment" <|> + controlSeq "provideenvironment" + withVerbatimMode $ do + optional $ symbol '*' + spaces + name <- untokenize <$> braced + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + let argspecs = map (\i -> ArgNum i) [1..numargs] + startcontents <- spaces >> bracedOrToken + endcontents <- spaces >> bracedOrToken + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ + | mtype == "newenvironment" -> do + report $ MacroAlreadyDefined name pos + return Nothing + | mtype == "provideenvironment" -> + return Nothing + _ -> return $ Just (name, + Macro ExpandWhenUsed argspecs optarg startcontents, + Macro ExpandWhenUsed [] Nothing endcontents) + +bracketedNum :: PandocMonad m => LP m Int +bracketedNum = do + ds <- untokenize <$> bracketedToks + case safeRead ds of + Just i -> return i + _ -> return 0 diff --git a/src/Text/Pandoc/Readers/LaTeX/Math.hs b/src/Text/Pandoc/Readers/LaTeX/Math.hs new file mode 100644 index 000000000..5b49a0376 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Math.hs @@ -0,0 +1,221 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Math + ( dollarsMath + , inlineEnvironments + , inlineEnvironment + , mathInline + , mathDisplay + , theoremstyle + , theoremEnvironment + , newtheorem + , proof + ) +where +import Data.Maybe (fromMaybe) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.Builder as B +import qualified Data.Sequence as Seq +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Class +import Text.Pandoc.Shared (trimMath, stripTrailingNewlines) +import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) +import Control.Applicative ((<|>), optional) +import Control.Monad (guard, mzero) +import qualified Data.Map as M +import Data.Text (Text) + +dollarsMath :: PandocMonad m => LP m Inlines +dollarsMath = do + symbol '$' + display <- option False (True <$ symbol '$') + (do contents <- try $ untokenize <$> pDollarsMath 0 + if display + then mathDisplay contents <$ symbol '$' + else return $ mathInline contents) + <|> (guard display >> return (mathInline "")) + +-- Int is number of embedded groupings +pDollarsMath :: PandocMonad m => Int -> LP m [Tok] +pDollarsMath n = do + tk@(Tok _ toktype t) <- anyTok + case toktype of + Symbol | t == "$" + , n == 0 -> return [] + | t == "\\" -> do + tk' <- anyTok + (tk :) . (tk' :) <$> pDollarsMath n + | t == "{" -> (tk :) <$> pDollarsMath (n+1) + | t == "}" -> + if n > 0 + then (tk :) <$> pDollarsMath (n-1) + else mzero + _ -> (tk :) <$> pDollarsMath n + +mathDisplay :: Text -> Inlines +mathDisplay = displayMath . trimMath + +mathInline :: Text -> Inlines +mathInline = math . trimMath + +mathEnvWith :: PandocMonad m + => (Inlines -> a) -> Maybe Text -> Text -> LP m a +mathEnvWith f innerEnv name = f . mathDisplay . 
inner <$> mathEnv name + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" <> y <> "}\n" <> x <> + "\\end{" <> y <> "}" + +mathEnv :: PandocMonad m => Text -> LP m Text +mathEnv name = do + skipopts + optional blankline + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ untokenize res + +inlineEnvironment :: PandocMonad m => LP m Inlines +inlineEnvironment = try $ do + controlSeq "begin" + name <- untokenize <$> braced + M.findWithDefault mzero name inlineEnvironments + +inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) +inlineEnvironments = M.fromList [ + ("displaymath", mathEnvWith id Nothing "displaymath") + , ("math", math <$> mathEnv "math") + , ("equation", mathEnvWith id Nothing "equation") + , ("equation*", mathEnvWith id Nothing "equation*") + , ("gather", mathEnvWith id (Just "gathered") "gather") + , ("gather*", mathEnvWith id (Just "gathered") "gather*") + , ("multline", mathEnvWith id (Just "gathered") "multline") + , ("multline*", mathEnvWith id (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") + , ("align", mathEnvWith id (Just "aligned") "align") + , ("align*", mathEnvWith id (Just "aligned") "align*") + , ("alignat", mathEnvWith id (Just "aligned") "alignat") + , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") + , ("dmath", mathEnvWith id Nothing "dmath") + , ("dmath*", mathEnvWith id Nothing "dmath*") + , ("dgroup", mathEnvWith id (Just "aligned") "dgroup") + , ("dgroup*", mathEnvWith id (Just "aligned") "dgroup*") + , ("darray", mathEnvWith id (Just "aligned") "darray") + , ("darray*", mathEnvWith id (Just "aligned") "darray*") + ] + +theoremstyle :: PandocMonad m => LP m Blocks +theoremstyle = do + stylename <- untokenize <$> braced + let mbstyle = case stylename of + "plain" -> Just PlainStyle + "definition" -> Just DefinitionStyle + "remark" -> Just RemarkStyle + _ -> Nothing + case mbstyle of + Nothing -> return () + Just sty -> updateState $ \s -> s{ sLastTheoremStyle = sty } + return mempty + +newtheorem :: PandocMonad m => LP m Inlines -> LP m Blocks +newtheorem inline = do + number <- option True (False <$ symbol '*' <* sp) + name <- untokenize <$> braced + sp + series <- option Nothing $ Just . untokenize <$> bracketedToks + sp + showName <- tokWith inline + sp + syncTo <- option Nothing $ Just . untokenize <$> bracketedToks + sty <- sLastTheoremStyle <$> getState + let spec = TheoremSpec { theoremName = showName + , theoremStyle = sty + , theoremSeries = series + , theoremSyncTo = syncTo + , theoremNumber = number + , theoremLastNum = DottedNum [0] } + tmap <- sTheoremMap <$> getState + updateState $ \s -> s{ sTheoremMap = + M.insert name spec tmap } + return mempty + +theoremEnvironment :: PandocMonad m + => LP m Blocks -> LP m Inlines -> Text -> LP m Blocks +theoremEnvironment blocks opt name = do + tmap <- sTheoremMap <$> getState + case M.lookup name tmap of + Nothing -> mzero + Just tspec -> do + optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt + mblabel <- option Nothing $ Just . untokenize <$> + try (spaces >> controlSeq "label" >> spaces >> braced) + bs <- env name blocks + number <- + if theoremNumber tspec + then do + let name' = fromMaybe name $ theoremSeries tspec + num <- getNextNumber + (maybe (DottedNum [0]) theoremLastNum . + M.lookup name' . 
sTheoremMap) + updateState $ \s -> + s{ sTheoremMap = + M.adjust + (\spec -> spec{ theoremLastNum = num }) + name' + (sTheoremMap s) + } + + case mblabel of + Just ident -> + updateState $ \s -> + s{ sLabels = M.insert ident + (B.toList $ + theoremName tspec <> "\160" <> + str (renderDottedNum num)) (sLabels s) } + Nothing -> return () + return $ space <> B.text (renderDottedNum num) + else return mempty + let titleEmph = case theoremStyle tspec of + PlainStyle -> B.strong + DefinitionStyle -> B.strong + RemarkStyle -> B.emph + let title = titleEmph (theoremName tspec <> number) + <> optTitle <> "." <> space + return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title + $ case theoremStyle tspec of + PlainStyle -> walk italicize bs + _ -> bs + + + +proof :: PandocMonad m => LP m Blocks -> LP m Inlines -> LP m Blocks +proof blocks opt = do + title <- option (B.text "Proof") opt + bs <- env "proof" blocks + return $ + B.divWith ("", ["proof"], []) $ + addQed $ addTitle (B.emph (title <> ".")) bs + +addTitle :: Inlines -> Blocks -> Blocks +addTitle ils bs = + case B.toList bs of + (Para xs : rest) + -> B.fromList (Para (B.toList ils ++ (Space : xs)) : rest) + _ -> B.para ils <> bs + +addQed :: Blocks -> Blocks +addQed bs = + case Seq.viewr (B.unMany bs) of + s Seq.:> Para ils + -> B.Many (s Seq.|> Para (ils ++ B.toList qedSign)) + _ -> bs <> B.para qedSign + where + qedSign = B.str "\xa0\x25FB" + +italicize :: Block -> Block +italicize x@(Para [Image{}]) = x -- see #6925 +italicize (Para ils) = Para [Emph ils] +italicize (Plain ils) = Plain [Emph ils] +italicize x = x + + diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index 563d32883..9dac4d6ef 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -1,11 +1,12 @@ {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.LaTeX.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,11 +28,15 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawLaTeXParser , applyMacros , tokenize + , tokenizeSources + , getInputTokens , untokenize , untoken , totoks , toksToString , satisfyTok + , parseFromToks + , disablingWithRaw , doMacros , doMacros' , setpos @@ -52,6 +57,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , comment , anyTok , singleChar + , tokWith , specialChars , endline , blankline @@ -78,6 +84,11 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawopt , overlaySpecification , getNextNumber + , label + , setCaption + , resetCaption + , env + , addMeta ) where import Control.Applicative (many, (<|>)) @@ -87,13 +98,15 @@ import Control.Monad.Trans (lift) import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord) import Data.Default import Data.List (intercalate) +import qualified Data.IntMap as IntMap import qualified Data.Map as M import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) -import Text.Pandoc.Error (PandocError (PandocMacroLoop)) +import Text.Pandoc.Error + (PandocError (PandocMacroLoop,PandocShouldNeverHappenError)) import Text.Pandoc.Logging import Text.Pandoc.Options import 
Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, @@ -102,7 +115,6 @@ import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), ArgSpec (..), Tok (..), TokType (..)) import Text.Pandoc.Shared import Text.Parsec.Pos --- import Debug.Trace newtype DottedNum = DottedNum [Int] deriving (Show, Eq) @@ -151,7 +163,9 @@ data LaTeXState = LaTeXState{ sOptions :: ReaderOptions , sLabels :: M.Map Text [Inline] , sHasChapters :: Bool , sToggles :: M.Map Text Bool - , sExpanded :: Bool + , sFileContents :: M.Map Text Text + , sEnableWithRaw :: Bool + , sRawTokens :: IntMap.IntMap [Tok] } deriving Show @@ -176,7 +190,9 @@ defaultLaTeXState = LaTeXState{ sOptions = def , sLabels = M.empty , sHasChapters = False , sToggles = M.empty - , sExpanded = False + , sFileContents = M.empty + , sEnableWithRaw = True + , sRawTokens = IntMap.empty } instance PandocMonad m => HasQuoteContext LaTeXState m where @@ -232,21 +248,25 @@ withVerbatimMode parser = do updateState $ \st -> st{ sVerbatimMode = False } return result -rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) +rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s, Show a) => [Tok] -> Bool -> LP m a -> LP m a - -> ParserT Text s m (a, Text) + -> ParserT Sources s m (a, Text) rawLaTeXParser toks retokenize parser valParser = do pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate } let lstate' = lstate { sMacros = extractMacros pstate } + let setStartPos = case toks of + Tok pos _ _ : _ -> setPosition pos + _ -> return () + let preparser = setStartPos >> parser let rawparser = (,) <$> withRaw valParser <*> getState - res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks + res' <- lift $ runParserT (withRaw (preparser >> getPosition)) + lstate "chunk" toks case res' of Left _ -> mzero - Right toks' -> do + Right (endpos, toks') -> do res <- lift $ runParserT (do when retokenize $ do -- retokenize, applying macros - doMacros ts <- many (satisfyTok (const True)) setInput ts rawparser) @@ -255,7 +275,13 @@ rawLaTeXParser toks retokenize parser valParser = do Left _ -> mzero Right ((val, raw), st) -> do updateState (updateMacros (sMacros st <>)) - _ <- takeP (T.length (untokenize toks')) + let skipTilPos stopPos = do + anyChar + pos <- getPosition + if pos >= stopPos + then return () + else skipTilPos stopPos + skipTilPos endpos let result = untokenize raw -- ensure we end with space if input did, see #4442 let result' = @@ -268,7 +294,7 @@ rawLaTeXParser toks retokenize parser valParser = do return (val, result') applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => Text -> ParserT Text s m Text + => Text -> ParserT Sources s m Text applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> do let retokenize = untokenize <$> many (satisfyTok (const True)) pstate <- getState @@ -279,6 +305,31 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> Left e -> Prelude.fail (show e) Right s' -> return s' +{- +When tokenize or untokenize change, test with this +QuickCheck property: + +> tokUntokRoundtrip :: String -> Bool +> tokUntokRoundtrip s = +> let t = T.pack s in untokenize (tokenize "random" t) == t +-} + +tokenizeSources :: Sources -> [Tok] +tokenizeSources = concatMap tokenizeSource . unSources + where + tokenizeSource (pos, t) = totoks pos t + +-- Return tokens from input sources. Ensure that starting position is +-- correct. 
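-- An illustrative sketch of the intent (assumed example, not from the
-- patch): if the parser has already advanced to line 3, column 1 of
-- "input.tex" and the remaining source text is "\emph{hi}", the tokens
-- handed back should carry that position rather than restart at line 1:
--
-- > getInputTokens
-- >   ==> [ Tok (newPos "input.tex" 3 1) (CtrlSeq "emph") "\\emph"
-- >       , Tok (newPos "input.tex" 3 6) Symbol "{"
-- >       , Tok (newPos "input.tex" 3 7) Word "hi"
-- >       , Tok (newPos "input.tex" 3 9) Symbol "}" ]
--
-- so error positions and the raw-token bookkeeping in withRaw stay aligned
-- with the original input.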
+getInputTokens :: PandocMonad m => ParserT Sources s m [Tok] +getInputTokens = do + pos <- getPosition + ss <- getInput + return $ + case ss of + Sources [] -> [] + Sources ((_,t):rest) -> tokenizeSources $ Sources ((pos,t):rest) + tokenize :: SourceName -> Text -> [Tok] tokenize sourcename = totoks (initialPos sourcename) @@ -402,41 +453,62 @@ untoken t = untokenAccum t mempty toksToString :: [Tok] -> String toksToString = T.unpack . untokenize +parseFromToks :: PandocMonad m => LP m a -> [Tok] -> LP m a +parseFromToks parser toks = do + oldInput <- getInput + setInput toks + oldpos <- getPosition + case toks of + Tok pos _ _ : _ -> setPosition pos + _ -> return () + result <- disablingWithRaw parser + setInput oldInput + setPosition oldpos + return result + +disablingWithRaw :: PandocMonad m => LP m a -> LP m a +disablingWithRaw parser = do + oldEnableWithRaw <- sEnableWithRaw <$> getState + updateState $ \st -> st{ sEnableWithRaw = False } + result <- parser + updateState $ \st -> st{ sEnableWithRaw = oldEnableWithRaw } + return result + satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok satisfyTok f = do doMacros -- apply macros on remaining input stream res <- tokenPrim (T.unpack . untoken) updatePos matcher - updateState $ \st -> st{ sExpanded = False } - return res + updateState $ \st -> + if sEnableWithRaw st + then st{ sRawTokens = IntMap.map (res:) $ sRawTokens st } + else st + return $! res where matcher t | f t = Just t | otherwise = Nothing updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos updatePos _spos _ (Tok pos _ _ : _) = pos - updatePos spos _ [] = incSourceColumn spos 1 + updatePos spos (Tok _ _ t) [] = incSourceColumn spos (T.length t) doMacros :: PandocMonad m => LP m () doMacros = do - expanded <- sExpanded <$> getState - verbatimMode <- sVerbatimMode <$> getState - unless (expanded || verbatimMode) $ do - getInput >>= doMacros' 1 >>= setInput - updateState $ \st -> st{ sExpanded = True } + st <- getState + unless (sVerbatimMode st) $ + getInput >>= doMacros' 1 >>= setInput doMacros' :: PandocMonad m => Int -> [Tok] -> LP m [Tok] doMacros' n inp = case inp of Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : Tok _ Word name : Tok _ Symbol "}" : ts - -> handleMacros n spos name ts + -> handleMacros n spos name ts <|> return inp Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : Tok _ Word name : Tok _ Symbol "}" : ts - -> handleMacros n spos ("end" <> name) ts + -> handleMacros n spos ("end" <> name) ts <|> return inp Tok _ (CtrlSeq "expandafter") _ : t : ts -> combineTok t <$> doMacros' n ts Tok spos (CtrlSeq name) _ : ts - -> handleMacros n spos name ts + -> handleMacros n spos name ts <|> return inp _ -> return inp - <|> return inp where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts) @@ -482,7 +554,7 @@ doMacros' n inp = $ throwError $ PandocMacroLoop name macros <- sMacros <$> getState case M.lookup name macros of - Nothing -> mzero + Nothing -> trySpecialMacro name ts Just (Macro expansionPoint argspecs optarg newtoks) -> do let getargs' = do args <- @@ -510,6 +582,41 @@ doMacros' n inp = ExpandWhenUsed -> doMacros' (n' + 1) result ExpandWhenDefined -> return result +-- | Certain macros do low-level tex manipulations that can't +-- be represented in our Macro type, so we handle them here. 
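-- A rough illustration of the two cases handled below (assumed LaTeX
-- input, not from the patch): \xspace re-inserts a space only when the
-- next token starts with an alphanumeric character,
--
-- > \newcommand{\pd}{pandoc\xspace}
-- > \pd rocks.   % yields "pandoc rocks."  (space restored)
-- > \pd.         % yields "pandoc."        (no space before punctuation)
--
-- while \iftrue / \iffalse are resolved textually: handleIf keeps the
-- tokens before \else (or the tokens after it, for the false branch) and
-- discards the matching \fi.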
+trySpecialMacro :: PandocMonad m => Text -> [Tok] -> LP m [Tok] +trySpecialMacro "xspace" ts = do + ts' <- doMacros' 1 ts + case ts' of + Tok pos Word t : _ + | startsWithAlphaNum t -> return $ Tok pos Spaces " " : ts' + _ -> return ts' +trySpecialMacro "iftrue" ts = handleIf True ts +trySpecialMacro "iffalse" ts = handleIf False ts +trySpecialMacro _ _ = mzero + +handleIf :: PandocMonad m => Bool -> [Tok] -> LP m [Tok] +handleIf b ts = do + res' <- lift $ runParserT (ifParser b) defaultLaTeXState "tokens" ts + case res' of + Left _ -> Prelude.fail "Could not parse conditional" + Right ts' -> return ts' + +ifParser :: PandocMonad m => Bool -> LP m [Tok] +ifParser b = do + ifToks <- many (notFollowedBy (controlSeq "else" <|> controlSeq "fi") + *> anyTok) + elseToks <- (controlSeq "else" >> manyTill anyTok (controlSeq "fi")) + <|> ([] <$ controlSeq "fi") + rest <- getInput + return $ (if b then ifToks else elseToks) ++ rest + +startsWithAlphaNum :: Text -> Bool +startsWithAlphaNum t = + case T.uncons t of + Just (c, _) | isAlphaNum c -> True + _ -> False + setpos :: SourcePos -> Tok -> Tok setpos spos (Tok _ tt txt) = Tok spos tt txt @@ -592,18 +699,22 @@ isCommentTok _ = False anyTok :: PandocMonad m => LP m Tok anyTok = satisfyTok (const True) +singleCharTok :: PandocMonad m => LP m Tok +singleCharTok = + satisfyTok $ \case + Tok _ Word t -> T.length t == 1 + Tok _ Symbol t -> not (T.any (`Set.member` specialChars) t) + _ -> False + singleChar :: PandocMonad m => LP m Tok -singleChar = try $ do - Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) - guard $ not $ toktype == Symbol && - T.any (`Set.member` specialChars) t - if T.length t > 1 - then do - let (t1, t2) = (T.take 1 t, T.drop 1 t) - inp <- getInput - setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp - return $ Tok pos toktype t1 - else return $ Tok pos toktype t +singleChar = singleCharTok <|> singleCharFromWord + where + singleCharFromWord = do + Tok pos toktype t <- disablingWithRaw $ satisfyTok isWordTok + let (t1, t2) = (T.take 1 t, T.drop 1 t) + inp <- getInput + setInput $ Tok pos toktype t1 : Tok (incSourceColumn pos 1) toktype t2 : inp + anyTok specialChars :: Set.Set Char specialChars = Set.fromList "#$%&~_^\\{}" @@ -646,28 +757,25 @@ grouped parser = try $ do -- {{a,b}} should be parsed the same as {a,b} try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) -braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok] -braced' getTok n = - handleEgroup <|> handleBgroup <|> handleOther - where handleEgroup = do - t <- symbol '}' - if n == 1 - then return [] - else (t:) <$> braced' getTok (n - 1) - handleBgroup = do - t <- symbol '{' - (t:) <$> braced' getTok (n + 1) - handleOther = do - t <- getTok - (t:) <$> braced' getTok n +braced' :: PandocMonad m => LP m Tok -> LP m [Tok] +braced' getTok = symbol '{' *> go (1 :: Int) + where + go n = do + t <- getTok + case t of + Tok _ Symbol "}" + | n > 1 -> (t:) <$> go (n - 1) + | otherwise -> return [] + Tok _ Symbol "{" -> (t:) <$> go (n + 1) + _ -> (t:) <$> go n braced :: PandocMonad m => LP m [Tok] -braced = symbol '{' *> braced' anyTok 1 +braced = braced' anyTok -- URLs require special handling, because they can contain % -- characters. So we retonenize comments as we go... bracedUrl :: PandocMonad m => LP m [Tok] -bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1 +bracedUrl = braced' (retokenizeComment >> anyTok) -- For handling URLs, which allow literal % characters... 
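-- To make the problem concrete (assumed example, not from the patch):
--
-- > \url{http://example.com/foo%20bar}
--
-- Under normal tokenization everything from "%20bar}" to the end of the
-- line becomes a single Comment token and the URL is cut short; re-lexing
-- that Comment as literal characters lets bracedUrl return the full
-- "http://example.com/foo%20bar".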
retokenizeComment :: PandocMonad m => LP m () @@ -723,16 +831,29 @@ ignore raw = do withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) withRaw parser = do - inp <- getInput + rawTokensMap <- sRawTokens <$> getState + let key = case IntMap.lookupMax rawTokensMap of + Nothing -> 0 + Just (n,_) -> n + 1 + -- insert empty list at key + updateState $ \st -> st{ sRawTokens = + IntMap.insert key [] $ sRawTokens st } result <- parser - nxtpos <- option Nothing ((\(Tok pos' _ _) -> Just pos') <$> lookAhead anyTok) - let raw = takeWhile (\(Tok pos _ _) -> maybe True - (\p -> sourceName p /= sourceName pos || pos < p) nxtpos) inp + mbRevToks <- IntMap.lookup key . sRawTokens <$> getState + raw <- case mbRevToks of + Just revtoks -> do + updateState $ \st -> st{ sRawTokens = + IntMap.delete key $ sRawTokens st} + return $ reverse revtoks + Nothing -> + throwError $ PandocShouldNeverHappenError $ + "sRawTokens has nothing at key " <> T.pack (show key) return (result, raw) keyval :: PandocMonad m => LP m (Text, Text) keyval = try $ do - Tok _ Word key <- satisfyTok isWordTok + key <- untokenize <$> many1 (notFollowedBy (symbol '=') >> + (symbol '-' <|> symbol '_' <|> satisfyTok isWordTok)) sp val <- option mempty $ do symbol '=' @@ -792,7 +913,7 @@ getRawCommand name txt = do (_, rawargs) <- withRaw $ case name of "write" -> do - void $ satisfyTok isWordTok -- digits + void $ many $ satisfyTok isDigitTok -- digits void braced "titleformat" -> do void braced @@ -807,6 +928,10 @@ getRawCommand name txt = do void $ many braced return $ txt <> untokenize rawargs +isDigitTok :: Tok -> Bool +isDigitTok (Tok _ Word t) = T.all isDigit t +isDigitTok _ = False + skipopts :: PandocMonad m => LP m () skipopts = skipMany (void overlaySpecification <|> void rawopt) @@ -874,3 +999,35 @@ getNextNumber getCurrentNum = do Just n -> [n, 1] Nothing -> [1] +label :: PandocMonad m => LP m () +label = do + controlSeq "label" + t <- braced + updateState $ \st -> st{ sLastLabel = Just $ untokenize t } + +setCaption :: PandocMonad m => LP m Inlines -> LP m () +setCaption inline = try $ do + skipopts + ils <- tokWith inline + optional $ try $ spaces *> label + updateState $ \st -> st{ sCaption = Just ils } + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ sCaption = Nothing + , sLastLabel = Nothing } + +env :: PandocMonad m => Text -> LP m a -> LP m a +env name p = p <* end_ name + +tokWith :: PandocMonad m => LP m Inlines -> LP m Inlines +tokWith inlineParser = try $ spaces >> + grouped inlineParser + <|> (lookAhead anyControlSeq >> inlineParser) + <|> singleChar' + where singleChar' = do + Tok _ _ t <- singleChar + return $ str t + +addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m () +addMeta field val = updateState $ \st -> + st{ sMeta = addMetaField field val $ sMeta st } diff --git a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs index db9c276e7..b8bf0ce7f 100644 --- a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs +++ b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs @@ -1,12 +1,7 @@ +{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} module Text.Pandoc.Readers.LaTeX.SIunitx - ( dosi - , doSI - , doSIrange - , doSInum - , doSInumlist - , doSIang - ) + ( siunitxCommands ) where import Text.Pandoc.Builder import Text.Pandoc.Readers.LaTeX.Parsing @@ -15,14 +10,32 @@ import Text.Pandoc.Class import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline, optional, space, spaces, withRaw, (<|>)) import Control.Applicative 
((<|>)) +import Control.Monad (void) import qualified Data.Map as M import Data.Char (isDigit) import Data.Text (Text) import qualified Data.Text as T import Data.List (intersperse) +import qualified Data.Sequence as Seq +import Text.Pandoc.Walk (walk) + +siunitxCommands :: PandocMonad m + => LP m Inlines -> M.Map Text (LP m Inlines) +siunitxCommands tok = M.fromList + [ ("si", dosi tok) + , ("SI", doSI tok) + , ("SIrange", doSIrange True tok) + , ("numrange", doSIrange False tok) + , ("numlist", doSInumlist) + , ("SIlist", doSIlist tok) + , ("num", doSInum) + , ("ang", doSIang) + ] dosi :: PandocMonad m => LP m Inlines -> LP m Inlines -dosi tok = grouped (siUnit tok) <|> siUnit tok +dosi tok = do + options <- option [] keyvals + grouped (siUnit options tok) <|> siUnit options tok -- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €" doSI :: PandocMonad m => LP m Inlines -> LP m Inlines @@ -57,23 +70,50 @@ doSInumlist = do mconcat (intersperse (str "," <> space) (init xs)) <> text ", & " <> last xs +doSIlist :: PandocMonad m => LP m Inlines -> LP m Inlines +doSIlist tok = do + options <- option [] keyvals + nums <- map tonum . T.splitOn ";" . untokenize <$> braced + unit <- grouped (siUnit options tok) <|> siUnit options tok + let xs = map (<> (str "\xa0" <> unit)) nums + case xs of + [] -> return mempty + [x] -> return x + _ -> return $ + mconcat (intersperse (str "," <> space) (init xs)) <> + text ", & " <> last xs + parseNum :: Parser Text () Inlines parseNum = (mconcat <$> many parseNumPart) <* eof +minus :: Text +minus = "\x2212" + +hyphenToMinus :: Inline -> Inline +hyphenToMinus (Str t) = Str (T.replace "-" minus t) +hyphenToMinus x = x + parseNumPart :: Parser Text () Inlines parseNumPart = parseDecimalNum <|> parseComma <|> parsePlusMinus <|> + parsePM <|> parseI <|> parseExp <|> parseX <|> parseSpace where - parseDecimalNum = do - pref <- option mempty $ (mempty <$ char '+') <|> ("\x2212" <$ char '-') - basenum <- (pref <>) . T.pack - <$> many1 (satisfy (\c -> isDigit c || c == '.')) + parseDecimalNum, parsePlusMinus, parsePM, + parseComma, parseI, parseX, + parseExp, parseSpace :: Parser Text () Inlines + parseDecimalNum = try $ do + pref <- option mempty $ (mempty <$ char '+') <|> (minus <$ char '-') + basenum' <- many1 (satisfy (\c -> isDigit c || c == '.')) + let basenum = pref <> T.pack + (case basenum' of + '.':_ -> '0':basenum' + _ -> basenum') uncertainty <- option mempty $ T.pack <$> parseParens if T.null uncertainty then return $ str basenum @@ -91,6 +131,7 @@ parseNumPart = | otherwise -> "." <> t parseComma = str "." <$ char ',' parsePlusMinus = str "\xa0\xb1\xa0" <$ try (string "+-") + parsePM = str "\xa0\xb1\xa0" <$ try (string "\\pm") parseParens = char '(' *> many1 (satisfy (\c -> isDigit c || c == '.')) <* char ')' parseI = str "i" <$ char 'i' @@ -103,11 +144,14 @@ doSIang :: PandocMonad m => LP m Inlines doSIang = do skipopts ps <- T.splitOn ";" . untokenize <$> braced + let dropPlus t = case T.uncons t of + Just ('+',t') -> t' + _ -> t case ps ++ repeat "" of (d:m:s:_) -> return $ - (if T.null d then mempty else str d <> str "\xb0") <> - (if T.null m then mempty else str m <> str "\x2032") <> - (if T.null s then mempty else str s <> str "\x2033") + (if T.null d then mempty else str (dropPlus d) <> str "\xb0") <> + (if T.null m then mempty else str (dropPlus m) <> str "\x2032") <> + (if T.null s then mempty else str (dropPlus s) <> str "\x2033") _ -> return mempty -- converts e.g. 
\SIrange{100}{200}{\ms} to "100 ms--200 ms" @@ -136,40 +180,99 @@ doSIrange includeUnits tok = do emptyOr160 :: Inlines -> Inlines emptyOr160 x = if x == mempty then x else str "\160" -siUnit :: PandocMonad m => LP m Inlines -> LP m Inlines -siUnit tok = try (do - Tok _ (CtrlSeq name) _ <- anyControlSeq - case name of - "square" -> do - unit <- siUnit tok - return $ unit <> superscript "2" - "cubic" -> do - unit <- siUnit tok - return $ unit <> superscript "3" - "raisetothe" -> do - n <- tok - unit <- siUnit tok - return $ unit <> superscript n - _ -> - case M.lookup name siUnitMap of - Just il -> - option il $ - choice - [ (il <> superscript "2") <$ controlSeq "squared" - , (il <> superscript "3") <$ controlSeq "cubed" - , (\n -> il <> superscript n) <$> (controlSeq "tothe" *> tok) - ] - Nothing -> fail "not an siunit unit command") - <|> (lookAhead anyControlSeq >> tok) - <|> (do Tok _ Word t <- satisfyTok isWordTok - return $ str t) - <|> (symbol '^' *> (superscript <$> tok)) - <|> (symbol '_' *> (subscript <$> tok)) - <|> ("\xa0" <$ symbol '.') - <|> ("\xa0" <$ symbol '~') - <|> tok - <|> (do Tok _ _ t <- anyTok - return (str t)) +siUnit :: forall m. PandocMonad m => [(Text,Text)] -> LP m Inlines -> LP m Inlines +siUnit options tok = mconcat . intersperse (str "\xa0") <$> many1 siUnitPart + where + siUnitPart :: LP m Inlines + siUnitPart = try $ do + skipMany (void (symbol '.') <|> void (symbol '~') <|> spaces1) + x <- ((siPrefix <*> siBase) + <|> (do u <- siBase <|> tok + option u $ siSuffix <*> pure u)) + option x (siInfix x) + siInfix :: Inlines -> LP m Inlines + siInfix u1 = try $ + (do _ <- controlSeq "per" + u2 <- siUnitPart + let useSlash = lookup "per-mode" options == Just "symbol" + if useSlash + then return (u1 <> str "/" <> u2) + else return (u1 <> str "\xa0" <> negateExponent u2)) + <|> (do _ <- symbol '/' + u2 <- siUnitPart + return (u1 <> str "/" <> u2)) + siPrefix :: LP m (Inlines -> Inlines) + siPrefix = + (do _ <- controlSeq "square" + skipopts + return (<> superscript "2")) + <|> (do _ <- controlSeq "cubic" + skipopts + return (<> superscript "3")) + <|> (do _ <- controlSeq "raisetothe" + skipopts + n <- walk hyphenToMinus <$> tok + return (<> superscript n)) + siSuffix :: LP m (Inlines -> Inlines) + siSuffix = + (do _ <- controlSeq "squared" + skipopts + return (<> superscript "2")) + <|> (do _ <- controlSeq "cubed" + skipopts + return (<> superscript "3")) + <|> (do _ <- controlSeq "tothe" + skipopts + n <- walk hyphenToMinus <$> tok + return (<> superscript n)) + <|> (symbol '^' *> (do n <- walk hyphenToMinus <$> tok + return (<> superscript n))) + <|> (symbol '_' *> (do n <- walk hyphenToMinus <$> tok + return (<> subscript n))) + negateExponent :: Inlines -> Inlines + negateExponent ils = + case Seq.viewr (unMany ils) of + xs Seq.:> Superscript ss -> (Many xs) <> + superscript (str minus <> fromList ss) + _ -> ils <> superscript (str (minus <> "1")) + siBase :: LP m Inlines + siBase = + ((try + (do Tok _ (CtrlSeq name) _ <- anyControlSeq + case M.lookup name siUnitModifierMap of + Just il -> (il <>) <$> siBase + Nothing -> + case M.lookup name siUnitMap of + Just il -> pure il + Nothing -> fail "not a unit command")) + <|> (do Tok _ Word t <- satisfyTok isWordTok + return $ str t) + ) + +siUnitModifierMap :: M.Map Text Inlines +siUnitModifierMap = M.fromList + [ ("atto", str "a") + , ("centi", str "c") + , ("deca", str "d") + , ("deci", str "d") + , ("deka", str "d") + , ("exa", str "E") + , ("femto", str "f") + , ("giga", str "G") + , ("hecto", str "h") + , 
("kilo", str "k") + , ("mega", str "M") + , ("micro", str "μ") + , ("milli", str "m") + , ("nano", str "n") + , ("peta", str "P") + , ("pico", str "p") + , ("tera", str "T") + , ("yocto", str "y") + , ("yotta", str "Y") + , ("zepto", str "z") + , ("zetta", str "Z") + ] siUnitMap :: M.Map Text Inlines siUnitMap = M.fromList @@ -269,7 +372,6 @@ siUnitMap = M.fromList , ("arcsecond", str "″") , ("astronomicalunit", str "ua") , ("atomicmassunit", str "u") - , ("atto", str "a") , ("bar", str "bar") , ("barn", str "b") , ("becquerel", str "Bq") @@ -277,51 +379,38 @@ siUnitMap = M.fromList , ("bohr", emph (str "a") <> subscript (str "0")) , ("candela", str "cd") , ("celsius", str "°C") - , ("centi", str "c") , ("clight", emph (str "c") <> subscript (str "0")) , ("coulomb", str "C") , ("dalton", str "Da") , ("day", str "d") - , ("deca", str "d") - , ("deci", str "d") , ("decibel", str "db") , ("degreeCelsius",str "°C") , ("degree", str "°") - , ("deka", str "d") , ("electronmass", emph (str "m") <> subscript (str "e")) , ("electronvolt", str "eV") , ("elementarycharge", emph (str "e")) - , ("exa", str "E") , ("farad", str "F") - , ("femto", str "f") - , ("giga", str "G") , ("gram", str "g") , ("gray", str "Gy") , ("hartree", emph (str "E") <> subscript (str "h")) , ("hectare", str "ha") - , ("hecto", str "h") , ("henry", str "H") , ("hertz", str "Hz") , ("hour", str "h") , ("joule", str "J") , ("katal", str "kat") , ("kelvin", str "K") - , ("kilo", str "k") , ("kilogram", str "kg") , ("knot", str "kn") , ("liter", str "L") , ("litre", str "l") , ("lumen", str "lm") , ("lux", str "lx") - , ("mega", str "M") , ("meter", str "m") , ("metre", str "m") - , ("micro", str "μ") - , ("milli", str "m") , ("minute", str "min") , ("mmHg", str "mmHg") , ("mole", str "mol") - , ("nano", str "n") , ("nauticalmile", str "M") , ("neper", str "Np") , ("newton", str "N") @@ -329,25 +418,17 @@ siUnitMap = M.fromList , ("Pa", str "Pa") , ("pascal", str "Pa") , ("percent", str "%") - , ("per", str "/") - , ("peta", str "P") - , ("pico", str "p") , ("planckbar", emph (str "\x210f")) , ("radian", str "rad") , ("second", str "s") , ("siemens", str "S") , ("sievert", str "Sv") , ("steradian", str "sr") - , ("tera", str "T") , ("tesla", str "T") , ("tonne", str "t") , ("volt", str "V") , ("watt", str "W") , ("weber", str "Wb") - , ("yocto", str "y") - , ("yotta", str "Y") - , ("zepto", str "z") - , ("zetta", str "Z") ] diff --git a/src/Text/Pandoc/Readers/LaTeX/Table.hs b/src/Text/Pandoc/Readers/LaTeX/Table.hs new file mode 100644 index 000000000..f56728fe1 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Table.hs @@ -0,0 +1,379 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +module Text.Pandoc.Readers.LaTeX.Table + ( tableEnvironments ) +where + +import Data.Functor (($>)) +import Text.Pandoc.Class +import Text.Pandoc.Readers.LaTeX.Parsing +import Text.Pandoc.Readers.LaTeX.Types +import Text.Pandoc.Builder as B +import qualified Data.Map as M +import Data.Text (Text) +import Data.Maybe (fromMaybe) +import qualified Data.Text as T +import Control.Applicative ((<|>), optional, many) +import Control.Monad (when, void) +import Text.Pandoc.Shared (safeRead, trim) +import Text.Pandoc.Logging (LogMessage(SkippedContent)) +import Text.Pandoc.Walk (walkM) +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, + optional, space, spaces, withRaw, (<|>)) + +tableEnvironments :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> M.Map Text (LP m Blocks) +tableEnvironments blocks inline 
= + M.fromList + [ ("longtable", env "longtable" $ + resetCaption *> + simpTable blocks inline "longtable" False >>= addTableCaption) + , ("table", env "table" $ + skipopts *> resetCaption *> blocks >>= addTableCaption) + , ("tabular*", env "tabular*" $ simpTable blocks inline "tabular*" True) + , ("tabularx", env "tabularx" $ simpTable blocks inline "tabularx" True) + , ("tabular", env "tabular" $ simpTable blocks inline "tabular" False) + ] + +hline :: PandocMonad m => LP m () +hline = try $ do + spaces + controlSeq "hline" <|> + (controlSeq "cline" <* braced) <|> + -- booktabs rules: + controlSeq "toprule" <|> + controlSeq "bottomrule" <|> + controlSeq "midrule" <|> + controlSeq "endhead" <|> + controlSeq "endfirsthead" + spaces + optional rawopt + return () + +lbreak :: PandocMonad m => LP m Tok +lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") + <* skipopts <* spaces + +amp :: PandocMonad m => LP m Tok +amp = symbol '&' + +-- Split a Word into individual Symbols (for parseAligns) +splitWordTok :: PandocMonad m => LP m () +splitWordTok = do + inp <- getInput + case inp of + (Tok spos Word t : rest) -> + setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest + _ -> return () + +parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))] +parseAligns = try $ do + let maybeBar = skipMany + (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced))) + let cAlign = AlignCenter <$ symbol 'c' + let lAlign = AlignLeft <$ symbol 'l' + let rAlign = AlignRight <$ symbol 'r' + let parAlign = AlignLeft <$ symbol 'p' + -- aligns from tabularx + let xAlign = AlignLeft <$ symbol 'X' + let mAlign = AlignLeft <$ symbol 'm' + let bAlign = AlignLeft <$ symbol 'b' + let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign + <|> xAlign <|> mAlign <|> bAlign ) + let alignPrefix = symbol '>' >> braced + let alignSuffix = symbol '<' >> braced + let colWidth = try $ do + symbol '{' + ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth") + spaces + symbol '}' + return $ safeRead ds + let alignSpec = do + pref <- option [] alignPrefix + spaces + al <- alignChar + width <- colWidth <|> option Nothing (do s <- untokenize <$> braced + pos <- getPosition + report $ SkippedContent s pos + return Nothing) + spaces + suff <- option [] alignSuffix + return (al, width, (pref, suff)) + let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro + symbol '*' + spaces + ds <- trim . untokenize <$> braced + spaces + spec <- braced + case safeRead ds of + Just n -> + getInput >>= setInput . (mconcat (replicate n spec) ++) + Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number" + bgroup + spaces + maybeBar + aligns' <- many $ try $ spaces >> optional starAlign >> + (alignSpec <* maybeBar) + spaces + egroup + spaces + return $ map toSpec aligns' + where + toColWidth (Just w) | w > 0 = ColWidth w + toColWidth _ = ColWidthDefault + toSpec (x, y, z) = (x, toColWidth y, z) + +-- N.B. this parser returns a Row that may have erroneous empty cells +-- in it. See the note above fixTableHead for details. 
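-- As a concrete sketch (assumed LaTeX input, not from the patch), a
-- two-row table whose first column is a single spanning cell
--
-- > \begin{tabular}{ll}
-- > \multirow{2}{*}{A} & B \\
-- >                    & C \\
-- > \end{tabular}
--
-- arrives with an empty leading cell in the second row that merely marks
-- the grid space occupied by A. parseTableRow still emits that cell, and
-- fixTableHead / fixTableBody later drop such padding cells so the Pandoc
-- table ends up with A spanning two rows next to B and C.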
+parseTableRow :: PandocMonad m + => LP m Blocks -- ^ block parser + -> LP m Inlines -- ^ inline parser + -> Text -- ^ table environment name + -> [([Tok], [Tok])] -- ^ pref/suffixes + -> LP m Row +parseTableRow blocks inline envname prefsufs = do + notFollowedBy (spaces *> end_ envname) + -- contexts that can contain & that is not colsep: + let canContainAmp (Tok _ (CtrlSeq "begin") _) = True + canContainAmp (Tok _ (CtrlSeq "verb") _) = True + canContainAmp (Tok _ (CtrlSeq "Verb") _) = True + canContainAmp _ = False + -- add prefixes and suffixes in token stream: + let celltoks (pref, suff) = do + prefpos <- getPosition + contents <- mconcat <$> + many ( snd <$> withRaw + ((lookAhead (controlSeq "parbox") >> + void blocks) -- #5711 + <|> + (lookAhead (satisfyTok canContainAmp) >> void inline) + <|> + (lookAhead (symbol '$') >> void inline)) + <|> + (do notFollowedBy + (() <$ amp <|> () <$ lbreak <|> end_ envname) + count 1 anyTok) ) + + suffpos <- getPosition + option [] (count 1 amp) + return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff + rawcells <- mapM celltoks prefsufs + cells <- mapM (parseFromToks (parseTableCell blocks)) rawcells + spaces + return $ Row nullAttr cells + +parseTableCell :: PandocMonad m => LP m Blocks -> LP m Cell +parseTableCell blocks = do + spaces + updateState $ \st -> st{ sInTableCell = True } + cell' <- multicolumnCell blocks + <|> multirowCell blocks + <|> parseSimpleCell + <|> parseEmptyCell + updateState $ \st -> st{ sInTableCell = False } + spaces + return cell' + where + -- The parsing of empty cells is important in LaTeX, especially when dealing + -- with multirow/multicolumn. See #6603. + parseEmptyCell = spaces $> emptyCell + parseSimpleCell = simpleCell <$> (plainify <$> blocks) + + +cellAlignment :: PandocMonad m => LP m Alignment +cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|') + where + alignment = do + c <- untoken <$> singleChar + return $ case c of + "l" -> AlignLeft + "r" -> AlignRight + "c" -> AlignCenter + "*" -> AlignDefault + _ -> AlignDefault + +plainify :: Blocks -> Blocks +plainify bs = case toList bs of + [Para ils] -> plain (fromList ils) + _ -> bs + +multirowCell :: PandocMonad m => LP m Blocks -> LP m Cell +multirowCell blocks = controlSeq "multirow" >> do + -- Full prototype for \multirow macro is: + -- \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text} + -- However, everything except `nrows` and `text` make + -- sense in the context of the Pandoc AST + _ <- optional $ symbol '[' *> cellAlignment <* symbol ']' -- vertical position + nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- bigstrut-related + _ <- symbol '{' *> manyTill anyTok (symbol '}') -- Cell width + _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']') -- Length used for fine-tuning + content <- symbol '{' *> (plainify <$> blocks) <* symbol '}' + return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content + +multicolumnCell :: PandocMonad m => LP m Blocks -> LP m Cell +multicolumnCell blocks = controlSeq "multicolumn" >> do + span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced + alignment <- symbol '{' *> cellAlignment <* symbol '}' + + let singleCell = do + content <- plainify <$> blocks + return $ cell alignment (RowSpan 1) (ColSpan span') content + + -- Two possible contents: either a \multirow cell, or content. + -- E.g. 
\multicol{1}{c}{\multirow{2}{1em}{content}} + -- Note that a \multirow cell can be nested in a \multicolumn, + -- but not the other way around. See #6603 + let nestedCell = do + (Cell _ _ (RowSpan rs) _ bs) <- multirowCell blocks + return $ cell + alignment + (RowSpan rs) + (ColSpan span') + (fromList bs) + + symbol '{' *> (nestedCell <|> singleCell) <* symbol '}' + +-- LaTeX tables are stored with empty cells underneath multirow cells +-- denoting the grid spaces taken up by them. More specifically, if a +-- cell spans m rows, then it will overwrite all the cells in the +-- columns it spans for (m-1) rows underneath it, requiring padding +-- cells in these places. These padding cells need to be removed for +-- proper table reading. See #6603. +-- +-- These fixTable functions do not otherwise fix up malformed +-- input tables: that is left to the table builder. +fixTableHead :: TableHead -> TableHead +fixTableHead (TableHead attr rows) = TableHead attr rows' + where + rows' = fixTableRows rows + +fixTableBody :: TableBody -> TableBody +fixTableBody (TableBody attr rhc th tb) + = TableBody attr rhc th' tb' + where + th' = fixTableRows th + tb' = fixTableRows tb + +fixTableRows :: [Row] -> [Row] +fixTableRows = fixTableRows' $ repeat Nothing + where + fixTableRows' oldHang (Row attr cells : rs) + = let (newHang, cells') = fixTableRow oldHang cells + rs' = fixTableRows' newHang rs + in Row attr cells' : rs' + fixTableRows' _ [] = [] + +-- The overhang is represented as Just (relative cell dimensions) or +-- Nothing for an empty grid space. +fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell]) +fixTableRow oldHang cells + -- If there's overhang, drop cells until their total width meets the + -- width of the occupied grid spaces (or we run out) + | (n, prefHang, restHang) <- splitHang oldHang + , n > 0 + = let cells' = dropToWidth getCellW n cells + (restHang', cells'') = fixTableRow restHang cells' + in (prefHang restHang', cells'') + -- Otherwise record the overhang of a pending cell and fix the rest + -- of the row + | c@(Cell _ _ h w _):cells' <- cells + = let h' = max 1 h + w' = max 1 w + oldHang' = dropToWidth getHangW w' oldHang + (newHang, cells'') = fixTableRow oldHang' cells' + in (toHang w' h' <> newHang, c : cells'') + | otherwise + = (oldHang, []) + where + getCellW (Cell _ _ _ w _) = w + getHangW = maybe 1 fst + getCS (ColSpan n) = n + + toHang c r + | r > 1 = [Just (c, r)] + | otherwise = replicate (getCS c) Nothing + + -- Take the prefix of the overhang list representing filled grid + -- spaces. Also return the remainder and the length of this prefix. + splitHang = splitHang' 0 id + + splitHang' !n l (Just (c, r):xs) + = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs + splitHang' n l xs = (n, l, xs) + + -- Drop list items until the total width of the dropped items + -- exceeds the passed width. + dropToWidth _ n l | n < 1 = l + dropToWidth wproj n (c:cs) = dropToWidth wproj (n - wproj c) cs + dropToWidth _ _ [] = [] + +simpTable :: PandocMonad m + => LP m Blocks + -> LP m Inlines + -> Text + -> Bool + -> LP m Blocks +simpTable blocks inline envname hasWidthParameter = try $ do + when hasWidthParameter $ () <$ tokWith inline + skipopts + colspecs <- parseAligns + let (aligns, widths, prefsufs) = unzip3 colspecs + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + skipMany hline + spaces + header' <- option [] . try . 
fmap (:[]) $ + parseTableRow blocks inline envname prefsufs <* + lbreak <* many1 hline + spaces + rows <- sepEndBy (parseTableRow blocks inline envname prefsufs) + (lbreak <* optional (skipMany hline)) + spaces + optional $ controlSeq "caption" *> setCaption inline + spaces + optional label + spaces + optional lbreak + spaces + lookAhead $ controlSeq "end" -- make sure we're at end + let th = fixTableHead $ TableHead nullAttr header' + let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows] + let tf = TableFoot nullAttr [] + return $ table emptyCaption (zip aligns widths) th tbs tf + +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table attr c spec th tb tf) = do + st <- getState + let mblabel = sLastLabel st + capt <- case (sCaption st, mblabel) of + (Just ils, Nothing) -> return $ caption Nothing (plain ils) + (Just ils, Just lab) -> do + num <- getNextNumber sLastTableNum + setState + st{ sLastTableNum = num + , sLabels = M.insert lab + [Str (renderDottedNum num)] + (sLabels st) } + return $ caption Nothing (plain ils) -- add number?? + (Nothing, _) -> return c + let attr' = case (attr, mblabel) of + ((_,classes,kvs), Just ident) -> + (ident,classes,kvs) + _ -> attr + return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf + go x = return x + +-- TODO: For now we add a Div to contain table attributes, since +-- most writers don't do anything yet with attributes on Table. +-- This can be removed when that changes. +addAttrDiv :: Attr -> Block -> Block +addAttrDiv ("",[],[]) b = b +addAttrDiv attr b = Div attr [b] diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs index a017a2afb..c20b72bc5 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Types.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -1,6 +1,7 @@ +{-# LANGUAGE FlexibleInstances #-} {- | Module : Text.Pandoc.Readers.LaTeX.Types - Copyright : Copyright (C) 2017-2020 John MacFarlane + Copyright : Copyright (C) 2017-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -18,7 +19,9 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) ) where import Data.Text (Text) -import Text.Parsec.Pos (SourcePos) +import Text.Parsec.Pos (SourcePos, sourceName) +import Text.Pandoc.Sources +import Data.List (groupBy) data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | Esc1 | Esc2 | Arg Int @@ -27,6 +30,16 @@ data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | data Tok = Tok SourcePos TokType Text deriving (Eq, Ord, Show) +instance ToSources [Tok] where + toSources = Sources + . map (\ts -> case ts of + Tok p _ _ : _ -> (p, mconcat $ map tokToText ts) + _ -> error "toSources [Tok] encountered empty group") + . 
groupBy (\(Tok p1 _ _) (Tok p2 _ _) -> sourceName p1 == sourceName p2) + +tokToText :: Tok -> Text +tokToText (Tok _ _ t) = t + data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed deriving (Eq, Ord, Show) diff --git a/src/Text/Pandoc/Readers/Man.hs b/src/Text/Pandoc/Readers/Man.hs index 21b8feaab..1141af66f 100644 --- a/src/Text/Pandoc/Readers/Man.hs +++ b/src/Text/Pandoc/Readers/Man.hs @@ -20,7 +20,7 @@ import Control.Monad (liftM, mzero, guard, void) import Control.Monad.Trans (lift) import Control.Monad.Except (throwError) import Data.Maybe (catMaybes, isJust) -import Data.List (intersperse, intercalate) +import Data.List (intersperse) import qualified Data.Text as T import Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report) @@ -29,9 +29,8 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Walk (query) -import Text.Pandoc.Shared (crFilter, mapLeft) +import Text.Pandoc.Shared (mapLeft) import Text.Pandoc.Readers.Roff -- TODO explicit imports -import Text.Parsec hiding (tokenPrim) import qualified Text.Parsec as Parsec import Text.Parsec.Pos (updatePosString) import qualified Data.Foldable as Foldable @@ -50,13 +49,20 @@ type ManParser m = ParserT [RoffToken] ManState m -- | Read man (troff) from an input string and return a Pandoc document. -readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc -readMan opts txt = do - tokenz <- lexRoff (initialPos "input") (crFilter txt) +readMan :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readMan opts s = do + let Sources inps = toSources s + tokenz <- mconcat <$> mapM (uncurry lexRoff) inps let state = def {readerOptions = opts} :: ManState + let fixError (PandocParsecError _ e) = PandocParsecError (Sources inps) e + fixError e = e eitherdoc <- readWithMTokens parseMan state (Foldable.toList . unRoffTokens $ tokenz) - either throwError return eitherdoc + either (throwError . fixError) return eitherdoc + readWithMTokens :: PandocMonad m => ParserT [RoffToken] ManState m a -- ^ parser @@ -64,9 +70,10 @@ readWithMTokens :: PandocMonad m -> [RoffToken] -- ^ input -> m (Either PandocError a) readWithMTokens parser state input = - let leftF = PandocParsecError . T.pack . 
intercalate "\n" $ show <$> input + let leftF = PandocParsecError mempty in mapLeft leftF `liftM` runParserT parser state "source" input + parseMan :: PandocMonad m => ManParser m Pandoc parseMan = do bs <- many parseBlock <* eof @@ -89,7 +96,7 @@ parseBlock = choice [ parseList parseTable :: PandocMonad m => ManParser m Blocks parseTable = do - modifyState $ \st -> st { tableCellsPlain = True } + updateState $ \st -> st { tableCellsPlain = True } let isTbl Tbl{} = True isTbl _ = False Tbl _opts rows pos <- msatisfy isTbl @@ -135,7 +142,7 @@ parseTable = do case res' of Left _ -> Prelude.fail "Could not parse table cell" Right x -> do - modifyState $ \s -> s{ tableCellsPlain = False } + updateState $ \s -> s{ tableCellsPlain = False } return x Right x -> return x @@ -222,7 +229,7 @@ parseTitle = do setMeta "section" (linePartsToInlines y) [x] -> setMeta "title" (linePartsToInlines x) [] -> id - modifyState $ \st -> st{ metadata = adjustMeta $ metadata st } + updateState $ \st -> st{ metadata = adjustMeta $ metadata st } return mempty linePartsToInlines :: [LinePart] -> Inlines diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 5888bf095..2dc7ddf52 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.Markdown - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,15 +21,17 @@ module Text.Pandoc.Readers.Markdown ( import Control.Monad import Control.Monad.Except (throwError) import Data.Char (isAlphaNum, isPunctuation, isSpace) -import Data.List (transpose, elemIndex, sortOn) +import Data.List (transpose, elemIndex, sortOn, foldl') +import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.Map as M import Data.Maybe import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Lazy as BL -import System.FilePath (addExtension, takeExtension) +import System.FilePath (addExtension, takeExtension, takeDirectory) +import qualified System.FilePath.Windows as Windows +import qualified System.FilePath.Posix as Posix import Text.HTML.TagSoup hiding (Row) import Text.Pandoc.Builder (Blocks, Inlines) import qualified Text.Pandoc.Builder as B @@ -45,20 +47,22 @@ import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag, isCommentTag, isInlineTag, isTextTag) import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline) import Text.Pandoc.Shared -import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.XML (fromEntities) -import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs) +import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs, yamlMetaBlock) +-- import Debug.Trace (traceShowId) -type MarkdownParser m = ParserT Text ParserState m +type MarkdownParser m = ParserT Sources ParserState m + +type F = Future ParserState -- | Read markdown from an input string and return a Pandoc document. 
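The signature change just below is the user-visible side of the new ToSources class: readers now accept anything convertible to Sources. A minimal call sketch (editorial, not part of the patch; it assumes the usual re-exports from the public Text.Pandoc module):

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text as T
import Text.Pandoc

readTwoFiles :: IO Pandoc
readTwoFiles = runIOorExplode $
  -- the [(FilePath, Text)] instance is what convertWithOpts now feeds in,
  -- so source positions and parse errors point at the right file
  readMarkdown def
    ([("a.md", "# One"), ("b.md", "# Two")] :: [(FilePath, T.Text)])

Plain Text input still works as before, via the Text instance of ToSources.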
-readMarkdown :: PandocMonad m +readMarkdown :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -- ^ Input -> m Pandoc readMarkdown opts s = do parsed <- readWithM parseMarkdown def{ stateOptions = opts } - (crFilter s <> "\n\n") + (ensureFinalNewlines 3 (toSources s)) case parsed of Right result -> return result Left e -> throwError e @@ -79,7 +83,7 @@ yamlToMeta opts mbfp bstr = do meta <- yamlBsToMeta (fmap B.toMetaValue <$> parseBlocks) bstr setPosition oldPos return $ runF meta defaultParserState - parsed <- readWithM parser def{ stateOptions = opts } "" + parsed <- readWithM parser def{ stateOptions = opts } ("" :: Text) case parsed of Right result -> return result Left e -> throwError e @@ -95,14 +99,12 @@ yamlToRefs :: PandocMonad m -> m [MetaValue] yamlToRefs idpred opts mbfp bstr = do let parser = do - oldPos <- getPosition case mbfp of Nothing -> return () Just fp -> setPosition $ initialPos fp refs <- yamlBsToRefs (fmap B.toMetaValue <$> parseBlocks) idpred bstr - setPosition oldPos return $ runF refs defaultParserState - parsed <- readWithM parser def{ stateOptions = opts } "" + parsed <- readWithM parser def{ stateOptions = opts } ("" :: Text) case parsed of Right result -> return result Left e -> throwError e @@ -145,14 +147,14 @@ inList = do ctx <- stateParserContext <$> getState guard (ctx == ListItemState) -spnl :: PandocMonad m => ParserT Text st m () +spnl :: PandocMonad m => ParserT Sources st m () spnl = try $ do skipSpaces optional newline skipSpaces notFollowedBy (char '\n') -spnl' :: PandocMonad m => ParserT Text st m Text +spnl' :: PandocMonad m => ParserT Sources st m Text spnl' = try $ do xs <- many spaceChar ys <- option "" $ try $ (:) <$> newline @@ -247,51 +249,48 @@ titleBlock :: PandocMonad m => MarkdownParser m () titleBlock = pandocTitleBlock <|> mmdTitleBlock pandocTitleBlock :: PandocMonad m => MarkdownParser m () -pandocTitleBlock = try $ do +pandocTitleBlock = do guardEnabled Ext_pandoc_title_block lookAhead (char '%') - title <- option mempty titleLine - author <- option (return []) authorsLine - date <- option mempty dateLine - optional blanklines - let meta' = do title' <- title - author' <- author - date' <- date - return $ - (if null title' then id else B.setMeta "title" title') - . (if null author' then id else B.setMeta "author" author') - . (if null date' then id else B.setMeta "date" date') - $ nullMeta - updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } - -yamlMetaBlock :: PandocMonad m => MarkdownParser m (F Blocks) -yamlMetaBlock = try $ do + try $ do + title <- option mempty titleLine + author <- option (return []) authorsLine + date <- option mempty dateLine + optional blanklines + let meta' = do title' <- title + author' <- author + date' <- date + return $ + (if null title' + then id + else B.setMeta "title" title') + . (if null author' + then id + else B.setMeta "author" author') + . 
(if null date' + then id + else B.setMeta "date" date') + $ nullMeta + updateState $ \st -> st{ stateMeta' = stateMeta' st <> meta' } + +yamlMetaBlock' :: PandocMonad m => MarkdownParser m (F Blocks) +yamlMetaBlock' = do guardEnabled Ext_yaml_metadata_block - string "---" - blankline - notFollowedBy blankline -- if --- is followed by a blank it's an HRULE - rawYamlLines <- manyTill anyLine stopLine - -- by including --- and ..., we allow yaml blocks with just comments: - let rawYaml = T.unlines ("---" : (rawYamlLines ++ ["..."])) - optional blanklines - newMetaF <- yamlBsToMeta (fmap B.toMetaValue <$> parseBlocks) - $ UTF8.fromTextLazy $ TL.fromStrict rawYaml + newMetaF <- yamlMetaBlock (fmap B.toMetaValue <$> parseBlocks) -- Since `<>` is left-biased, existing values are not touched: updateState $ \st -> st{ stateMeta' = stateMeta' st <> newMetaF } return mempty -stopLine :: PandocMonad m => MarkdownParser m () -stopLine = try $ (string "---" <|> string "...") >> blankline >> return () - mmdTitleBlock :: PandocMonad m => MarkdownParser m () -mmdTitleBlock = try $ do +mmdTitleBlock = do guardEnabled Ext_mmd_title_block - firstPair <- kvPair False - restPairs <- many (kvPair True) - let kvPairs = firstPair : restPairs - blanklines - updateState $ \st -> st{ stateMeta' = stateMeta' st <> - return (Meta $ M.fromList kvPairs) } + try $ do + firstPair <- kvPair False + restPairs <- many (kvPair True) + let kvPairs = firstPair : restPairs + blanklines + updateState $ \st -> st{ stateMeta' = stateMeta' st <> + return (Meta $ M.fromList kvPairs) } kvPair :: PandocMonad m => Bool -> MarkdownParser m (Text, MetaValue) kvPair allowEmpty = try $ do @@ -300,7 +299,7 @@ kvPair allowEmpty = try $ do (try $ newline >> lookAhead (blankline <|> nonspaceChar)) guard $ allowEmpty || not (T.null val) let key' = T.concat $ T.words $ T.toLower key - let val' = MetaBlocks $ B.toList $ B.plain $ B.text val + let val' = MetaInlines $ B.toList $ B.text val return (key',val') parseMarkdown :: PandocMonad m => MarkdownParser m Pandoc @@ -334,10 +333,14 @@ referenceKey = try $ do skipMany spaceChar notFollowedBy' referenceTitle notFollowedBy' $ guardEnabled Ext_link_attributes >> attributes + notFollowedBy' $ guardEnabled Ext_mmd_link_attributes >> + try (spnl <* keyValAttr) notFollowedBy' (() <$ reference) many1Char $ notFollowedBy space >> litChar let betweenAngles = try $ char '<' >> manyTillChar litChar (char '>') - src <- try betweenAngles <|> sourceURL + rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths) + src <- (if rebase then rebasePath pos else id) <$> + (try betweenAngles <|> sourceURL) tit <- option "" referenceTitle attr <- option nullAttr $ try $ do guardEnabled Ext_link_attributes @@ -346,7 +349,7 @@ referenceKey = try $ do addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes >> many (try $ spnl >> keyValAttr) blanklines - let attr' = extractIdClass $ foldl (\x f -> f x) attr addKvs + let attr' = extractIdClass $ foldl' (\x f -> f x) attr addKvs target = (escapeURI $ trimr src, tit) st <- getState let oldkeys = stateKeys st @@ -442,7 +445,7 @@ block :: PandocMonad m => MarkdownParser m (F Blocks) block = do res <- choice [ mempty <$ blanklines , codeBlockFenced - , yamlMetaBlock + , yamlMetaBlock' -- note: bulletList needs to be before header because of -- the possibility of empty list items: - , bulletList @@ -568,7 +571,7 @@ registerImplicitHeader raw attr@(ident, _, _) -- hrule block -- -hrule :: PandocMonad m => ParserT Text st m (F Blocks) +hrule :: PandocMonad m => 
ParserT Sources st m (F Blocks) hrule = try $ do skipSpaces start <- satisfy isHruleChar @@ -588,7 +591,7 @@ indentedLine = indentSpaces >> anyLineNewline blockDelimiter :: PandocMonad m => (Char -> Bool) -> Maybe Int - -> ParserT Text ParserState m Int + -> ParserT Sources ParserState m Int blockDelimiter f len = try $ do skipNonindentSpaces c <- lookAhead (satisfy f) @@ -602,7 +605,7 @@ attributes = try $ do spnl attrs <- many (attribute <* spnl) char '}' - return $ foldl (\x f -> f x) nullAttr attrs + return $ foldl' (\x f -> f x) nullAttr attrs attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr) attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr @@ -659,15 +662,15 @@ codeBlockFenced :: PandocMonad m => MarkdownParser m (F Blocks) codeBlockFenced = try $ do indentchars <- nonindentSpaces let indentLevel = T.length indentchars - c <- try (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) + c <- (guardEnabled Ext_fenced_code_blocks >> lookAhead (char '~')) <|> (guardEnabled Ext_backtick_code_blocks >> lookAhead (char '`')) size <- blockDelimiter (== c) Nothing skipMany spaceChar rawattr <- - (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + (Left <$> (guardEnabled Ext_raw_attribute >> try rawAttribute)) <|> (Right <$> option ("",[],[]) - (try (guardEnabled Ext_fenced_code_attributes >> attributes) + ((guardEnabled Ext_fenced_code_attributes >> try attributes) <|> ((\x -> ("",[toLanguageId x],[])) <$> many1Char nonspaceChar))) blankline contents <- T.intercalate "\n" <$> @@ -732,7 +735,7 @@ lhsCodeBlockBirdWith c = try $ do blanklines return $ T.intercalate "\n" lns' -birdTrackLine :: PandocMonad m => Char -> ParserT Text st m Text +birdTrackLine :: PandocMonad m => Char -> ParserT Sources st m Text birdTrackLine c = try $ do char c -- allow html tags on left margin: @@ -1025,7 +1028,7 @@ para = try $ do option (B.plain <$> result) $ try $ do newline - (blanklines >> return mempty) + (mempty <$ blanklines) <|> (guardDisabled Ext_blank_before_blockquote >> () <$ lookAhead blockQuote) <|> (guardEnabled Ext_backtick_code_blocks >> () <$ lookAhead codeBlockFenced) <|> (guardDisabled Ext_blank_before_header >> () <$ lookAhead header) @@ -1118,6 +1121,7 @@ rawTeXBlock = do rawHtmlBlocks :: PandocMonad m => MarkdownParser m (F Blocks) rawHtmlBlocks = do (TagOpen tagtype _, raw) <- htmlTag isBlockTag + let selfClosing = "/>" `T.isSuffixOf` raw -- we don't want '<td> text' to be a code block: skipMany spaceChar indentlevel <- (blankline >> length <$> many (char ' ')) <|> return 0 @@ -1131,7 +1135,9 @@ rawHtmlBlocks = do gobbleAtMostSpaces indentlevel notFollowedBy' closer block - contents <- mconcat <$> many block' + contents <- if selfClosing + then return mempty + else mconcat <$> many block' result <- try (do gobbleAtMostSpaces indentlevel @@ -1155,11 +1161,12 @@ stripMarkdownAttribute s = renderTags' $ map filterAttrib $ parseTags s -- lineBlock :: PandocMonad m => MarkdownParser m (F Blocks) -lineBlock = try $ do +lineBlock = do guardEnabled Ext_line_blocks - lines' <- lineBlockLines >>= - mapM (parseFromString' (trimInlinesF <$> inlines)) - return $ B.lineBlock <$> sequence lines' + try $ do + lines' <- lineBlockLines >>= + mapM (parseFromString' (trimInlinesF <$> inlines)) + return $ B.lineBlock <$> sequence lines' -- -- Tables @@ -1169,7 +1176,7 @@ lineBlock = try $ do -- and the length including trailing space. 
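Before the parser below, a quick sanity check on the pair it returns (an editorial sketch, not part of the patch): for a simple-table delimiter line like "-----  ------ ----" the three dash runs measure out as follows, and, roughly, the running totals of the second components become the column boundaries the table parsers use.

expectedDashRuns :: [(Int, Int)]
expectedDashRuns = [(5, 7), (6, 7), (4, 4)]
  -- fst = number of dashes, snd = dashes plus trailing spaces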
dashedLine :: PandocMonad m => Char - -> ParserT Text st m (Int, Int) + -> ParserT Sources st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many spaceChar @@ -1238,7 +1245,7 @@ rawTableLine :: PandocMonad m -> MarkdownParser m [Text] rawTableLine indices = do notFollowedBy' (blanklines' <|> tableFooter) - line <- take1WhileP (/='\n') <* newline + line <- anyLine return $ map trim $ tail $ splitTextByIndices (init indices) line @@ -1261,11 +1268,12 @@ multilineRow indices = do -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. tableCaption :: PandocMonad m => MarkdownParser m (F Inlines) -tableCaption = try $ do +tableCaption = do guardEnabled Ext_table_captions - skipNonindentSpaces - (string ":" <* notFollowedBy (satisfy isPunctuation)) <|> string "Table:" - trimInlinesF <$> inlines1 <* blanklines + try $ do + skipNonindentSpaces + (string ":" <* notFollowedBy (satisfy isPunctuation)) <|> string "Table:" + trimInlinesF <$> inlines1 <* blanklines -- Parse a simple table with '---' header and one line per row. simpleTable :: PandocMonad m @@ -1351,7 +1359,7 @@ pipeTable = try $ do lines' <- many pipeTableRow let lines'' = map (take (length aligns) <$>) lines' let maxlength = maximum $ - map (\x -> T.length . stringify $ runF x def) (heads' : lines'') + fmap (\x -> T.length . stringify $ runF x def) (heads' :| lines'') numColumns <- getOption readerColumns let widths = if maxlength > numColumns then map (\len -> @@ -1388,7 +1396,7 @@ pipeTableCell = return $ B.plain <$> result) <|> return mempty -pipeTableHeaderPart :: PandocMonad m => ParserT Text st m (Alignment, Int) +pipeTableHeaderPart :: PandocMonad m => ParserT Sources st m (Alignment, Int) pipeTableHeaderPart = try $ do skipMany spaceChar left <- optionMaybe (char ':') @@ -1404,10 +1412,14 @@ pipeTableHeaderPart = try $ do (Just _,Just _) -> AlignCenter, len) -- Succeed only if current line contains a pipe. 
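The rewrite of scanForPipe just below has to inspect a Sources value rather than flat Text; a sketch of the two input shapes it handles (editorial, not part of the patch; assumes Text.Pandoc.Sources and Text.Parsec.Pos's initialPos are in scope, plus OverloadedStrings as in this module):

scanForPipeInputs :: [Sources]
scanForPipeInputs =
  [ toSources ("| a | b |\n|---|---|" :: T.Text)   -- ordinary single chunk
  , Sources [ (initialPos "a.md", "")              -- exhausted first chunk
            , (initialPos "b.md", "| a | b |") ]   -- pipe sits in the next one
  ]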
-scanForPipe :: PandocMonad m => ParserT Text st m () +scanForPipe :: PandocMonad m => ParserT Sources st m () scanForPipe = do - inp <- getInput - case T.break (\c -> c == '\n' || c == '|') inp of + Sources inps <- getInput + let ln = case inps of + [] -> "" + ((_,t):(_,t'):_) | T.null t -> t' + ((_,t):_) -> t + case T.break (\c -> c == '\n' || c == '|') ln of (_, T.uncons -> Just ('|', _)) -> return () _ -> mzero @@ -1434,15 +1446,14 @@ table :: PandocMonad m => MarkdownParser m (F Blocks) table = try $ do frontCaption <- option Nothing (Just <$> tableCaption) (aligns, widths, heads, lns) <- - try (guardEnabled Ext_pipe_tables >> scanForPipe >> pipeTable) <|> - try (guardEnabled Ext_multiline_tables >> - multilineTable False) <|> - try (guardEnabled Ext_simple_tables >> - (simpleTable True <|> simpleTable False)) <|> - try (guardEnabled Ext_multiline_tables >> - multilineTable True) <|> - try (guardEnabled Ext_grid_tables >> - (gridTable False <|> gridTable True)) <?> "table" + (guardEnabled Ext_pipe_tables >> try (scanForPipe >> pipeTable)) <|> + (guardEnabled Ext_multiline_tables >> try (multilineTable False)) <|> + (guardEnabled Ext_simple_tables >> + try (simpleTable True <|> simpleTable False)) <|> + (guardEnabled Ext_multiline_tables >> + try (multilineTable True)) <|> + (guardEnabled Ext_grid_tables >> + try (gridTable False <|> gridTable True)) <?> "table" optional blanklines caption <- case frontCaption of Nothing -> option (return mempty) tableCaption @@ -1476,35 +1487,37 @@ inlines1 :: PandocMonad m => MarkdownParser m (F Inlines) inlines1 = mconcat <$> many1 inline inline :: PandocMonad m => MarkdownParser m (F Inlines) -inline = choice [ whitespace - , bareURL - , str - , endline - , code - , strongOrEmph - , note - , cite - , bracketedSpan - , link - , image - , math - , strikeout - , subscript - , superscript - , inlineNote -- after superscript because of ^[link](/foo)^ - , autoLink - , spanHtml - , rawHtmlInline - , escapedNewline - , escapedChar - , rawLaTeXInline' - , exampleRef - , smart - , return . B.singleton <$> charRef - , emoji - , symbol - , ltSign - ] <?> "inline" +inline = do + c <- lookAhead anyChar + ((case c of + ' ' -> whitespace + '\t' -> whitespace + '\n' -> endline + '`' -> code + '_' -> strongOrEmph + '*' -> strongOrEmph + '^' -> superscript <|> inlineNote -- in this order bc ^[link](/foo)^ + '[' -> note <|> cite <|> bracketedSpan <|> link + '!' -> image + '$' -> math + '~' -> strikeout <|> subscript + '<' -> autoLink <|> spanHtml <|> rawHtmlInline <|> ltSign + '\\' -> math <|> escapedNewline <|> escapedChar <|> rawLaTeXInline' + '@' -> cite <|> exampleRef + '"' -> smart + '\'' -> smart + '\8216' -> smart + '\145' -> smart + '\8220' -> smart + '\147' -> smart + '-' -> smart + '.' -> smart + '&' -> return . 
B.singleton <$> charRef + ':' -> emoji + _ -> mzero) + <|> bareURL + <|> str + <|> symbol) <?> "inline" escapedChar' :: PandocMonad m => MarkdownParser m Char escapedChar' = try $ do @@ -1515,11 +1528,12 @@ escapedChar' = try $ do <|> oneOf "\\`*_{}[]()>#+-.!~\"" escapedNewline :: PandocMonad m => MarkdownParser m (F Inlines) -escapedNewline = try $ do +escapedNewline = do guardEnabled Ext_escaped_line_breaks - char '\\' - lookAhead (char '\n') -- don't consume the newline (see #3730) - return $ return B.linebreak + try $ do + char '\\' + lookAhead (char '\n') -- don't consume the newline (see #3730) + return $ return B.linebreak escapedChar :: PandocMonad m => MarkdownParser m (F Inlines) escapedChar = do @@ -1541,19 +1555,20 @@ ltSign = do -- whole document has been parsed. But we need this parser -- here in case citations is disabled. exampleRef :: PandocMonad m => MarkdownParser m (F Inlines) -exampleRef = try $ do +exampleRef = do guardEnabled Ext_example_lists - char '@' - lab <- mconcat . map T.pack <$> - many (many1 alphaNum <|> - try (do c <- char '_' <|> char '-' - cs <- many1 alphaNum - return (c:cs))) - return $ do - st <- askF - return $ case M.lookup lab (stateExamples st) of - Just n -> B.str $ tshow n - Nothing -> B.str $ "@" <> lab + try $ do + char '@' + lab <- mconcat . map T.pack <$> + many (many1 alphaNum <|> + try (do c <- char '_' <|> char '-' + cs <- many1 alphaNum + return (c:cs))) + return $ do + st <- askF + return $ case M.lookup lab (stateExamples st) of + Just n -> B.str $ tshow n + Nothing -> B.str $ "@" <> lab symbol :: PandocMonad m => MarkdownParser m (F Inlines) symbol = do @@ -1580,10 +1595,10 @@ code = try $ do >> count (length starts) (char '`') >> notFollowedBy (char '`')) rawattr <- - (Left <$> try (guardEnabled Ext_raw_attribute >> rawAttribute)) + (Left <$> (guardEnabled Ext_raw_attribute >> try rawAttribute)) <|> (Right <$> option ("",[],[]) - (try (guardEnabled Ext_inline_code_attributes >> attributes))) + (guardEnabled Ext_inline_code_attributes >> try attributes)) return $ return $ case rawattr of Left syn -> B.rawInline syn result @@ -1676,38 +1691,40 @@ strikeout = fmap B.strikeout <$> strikeEnd = try $ string "~~" superscript :: PandocMonad m => MarkdownParser m (F Inlines) -superscript = fmap B.superscript <$> try (do +superscript = do guardEnabled Ext_superscript - char '^' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '^')) + fmap B.superscript <$> try (do + char '^' + mconcat <$> many1Till (do notFollowedBy spaceChar + notFollowedBy newline + inline) (char '^')) subscript :: PandocMonad m => MarkdownParser m (F Inlines) -subscript = fmap B.subscript <$> try (do +subscript = do guardEnabled Ext_subscript - char '~' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '~')) + fmap B.subscript <$> try (do + char '~' + mconcat <$> many1Till (do notFollowedBy spaceChar + notFollowedBy newline + inline) (char '~')) whitespace :: PandocMonad m => MarkdownParser m (F Inlines) whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace" where lb = spaceChar >> skipMany spaceChar >> option B.space (endline >> return B.linebreak) regsp = skipMany spaceChar >> return B.space -nonEndline :: PandocMonad m => ParserT Text st m Char +nonEndline :: PandocMonad m => ParserT Sources st m Char nonEndline = satisfy (/='\n') str :: PandocMonad m => MarkdownParser m (F Inlines) str = do result <- mconcat <$> many1 - ( take1WhileP isAlphaNum + ( T.pack <$> (many1 
alphaNum) <|> "." <$ try (char '.' <* notFollowedBy (char '.')) ) updateLastStrPos (do guardEnabled Ext_smart abbrevs <- getOption readerAbbreviations - if not (T.null result) && T.last result == '.' && result `Set.member` abbrevs + if result `Set.member` abbrevs then try (do ils <- whitespace notFollowedBy (() <$ cite <|> () <$ note) -- ?? lookAhead alphaNum @@ -1790,15 +1807,16 @@ link = try $ do regLink B.linkWith lab <|> referenceLink B.linkWith (lab,raw) bracketedSpan :: PandocMonad m => MarkdownParser m (F Inlines) -bracketedSpan = try $ do +bracketedSpan = do guardEnabled Ext_bracketed_spans - (lab,_) <- reference - attr <- attributes - return $ if isSmallCaps attr - then B.smallcaps <$> lab - else if isUnderline attr - then B.underline <$> lab - else B.spanWith attr <$> lab + try $ do + (lab,_) <- reference + attr <- attributes + return $ if isSmallCaps attr + then B.smallcaps <$> lab + else if isUnderline attr + then B.underline <$> lab + else B.spanWith attr <$> lab -- | We treat a span as SmallCaps if class is "smallcaps" (and -- no other attributes are set or if style is "font-variant:small-caps" @@ -1825,9 +1843,12 @@ regLink :: PandocMonad m -> MarkdownParser m (F Inlines) regLink constructor lab = try $ do (src, tit) <- source + rebase <- option False (True <$ guardEnabled Ext_rebase_relative_paths) + pos <- getPosition + let src' = if rebase then rebasePath pos src else src attr <- option nullAttr $ guardEnabled Ext_link_attributes >> attributes - return $ constructor attr src tit <$> lab + return $ constructor attr src' tit <$> lab -- a link like [this][ref] or [this][] or [this] referenceLink :: PandocMonad m @@ -1867,7 +1888,8 @@ referenceLink constructor (lab, raw) = do Just ((src, tit), _) -> constructor nullAttr src tit <$> lab Nothing -> makeFallback else makeFallback - Just ((src,tit), attr) -> constructor attr src tit <$> lab + Just ((src,tit), attr) -> + constructor attr src tit <$> lab dropBrackets :: Text -> Text dropBrackets = dropRB . dropLB @@ -1877,12 +1899,13 @@ dropBrackets = dropRB . dropLB dropLB xs = xs bareURL :: PandocMonad m => MarkdownParser m (F Inlines) -bareURL = try $ do +bareURL = do guardEnabled Ext_autolink_bare_uris getState >>= guard . stateAllowLinks - (cls, (orig, src)) <- (("uri",) <$> uri) <|> (("email",) <$> emailAddress) - notFollowedBy $ try $ spaces >> htmlTag (~== TagClose ("a" :: Text)) - return $ return $ B.linkWith ("",[cls],[]) src "" (B.str orig) + try $ do + (cls, (orig, src)) <- (("uri",) <$> uri) <|> (("email",) <$> emailAddress) + notFollowedBy $ try $ spaces >> htmlTag (~== TagClose ("a" :: Text)) + return $ return $ B.linkWith ("",[cls],[]) src "" (B.str orig) autoLink :: PandocMonad m => MarkdownParser m (F Inlines) autoLink = try $ do @@ -1899,15 +1922,33 @@ autoLink = try $ do return $ return $ B.linkWith attr (src <> escapeURI extra) "" (B.str $ orig <> extra) +-- | Rebase a relative path, by adding the (relative) directory +-- of the containing source position. Absolute links and URLs +-- are untouched. +rebasePath :: SourcePos -> Text -> Text +rebasePath pos path = do + let fp = sourceName pos + isFragment = T.take 1 path == "#" + path' = T.unpack path + isAbsolutePath = Posix.isAbsolute path' || Windows.isAbsolute path' + in if T.null path || isFragment || isAbsolutePath || isURI path + then path + else + case takeDirectory fp of + "" -> path + "." -> path + d -> T.pack d <> "/" <> path + image :: PandocMonad m => MarkdownParser m (F Inlines) image = try $ do char '!' 
(lab,raw) <- reference defaultExt <- getOption readerDefaultImageExtension - let constructor attr' src = case takeExtension (T.unpack src) of - "" -> B.imageWith attr' (T.pack $ addExtension (T.unpack src) - $ T.unpack defaultExt) - _ -> B.imageWith attr' src + let constructor attr' src = + case takeExtension (T.unpack src) of + "" -> B.imageWith attr' (T.pack $ addExtension (T.unpack src) + $ T.unpack defaultExt) + _ -> B.imageWith attr' src regLink constructor lab <|> referenceLink constructor (lab,raw) note :: PandocMonad m => MarkdownParser m (F Inlines) @@ -1935,23 +1976,25 @@ note = try $ do return $ B.note $ walk adjustCite contents' inlineNote :: PandocMonad m => MarkdownParser m (F Inlines) -inlineNote = try $ do +inlineNote = do guardEnabled Ext_inline_notes - char '^' - updateState $ \st -> st{ stateInNote = True - , stateNoteNumber = stateNoteNumber st + 1 } - contents <- inlinesInBalancedBrackets - updateState $ \st -> st{ stateInNote = False } - return $ B.note . B.para <$> contents + try $ do + char '^' + updateState $ \st -> st{ stateInNote = True + , stateNoteNumber = stateNoteNumber st + 1 } + contents <- inlinesInBalancedBrackets + updateState $ \st -> st{ stateInNote = False } + return $ B.note . B.para <$> contents rawLaTeXInline' :: PandocMonad m => MarkdownParser m (F Inlines) -rawLaTeXInline' = try $ do +rawLaTeXInline' = do guardEnabled Ext_raw_tex notFollowedBy' rawConTeXtEnvironment - s <- rawLaTeXInline - return $ return $ B.rawInline "tex" s -- "tex" because it might be context + try $ do + s <- rawLaTeXInline + return $ return $ B.rawInline "tex" s -- "tex" because it might be context -rawConTeXtEnvironment :: PandocMonad m => ParserT Text st m Text +rawConTeXtEnvironment :: PandocMonad m => ParserT Sources st m Text rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) @@ -1960,7 +2003,7 @@ rawConTeXtEnvironment = try $ do (try $ string "\\stop" >> textStr completion) return $ "\\start" <> completion <> T.concat contents <> "\\stop" <> completion -inBrackets :: PandocMonad m => ParserT Text st m Char -> ParserT Text st m Text +inBrackets :: PandocMonad m => ParserT Sources st m Char -> ParserT Sources st m Text inBrackets parser = do char '[' contents <- manyChar parser @@ -1968,55 +2011,60 @@ inBrackets parser = do return $ "[" <> contents <> "]" spanHtml :: PandocMonad m => MarkdownParser m (F Inlines) -spanHtml = try $ do +spanHtml = do guardEnabled Ext_native_spans - (TagOpen _ attrs, _) <- htmlTag (~== TagOpen ("span" :: Text) []) - contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose ("span" :: Text))) - let ident = fromMaybe "" $ lookup "id" attrs - let classes = maybe [] T.words $ lookup "class" attrs - let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ if isSmallCaps (ident, classes, keyvals) - then B.smallcaps <$> contents - else if isUnderline (ident, classes, keyvals) - then B.underline <$> contents - else B.spanWith (ident, classes, keyvals) <$> contents + try $ do + (TagOpen _ attrs, _) <- htmlTag (~== TagOpen ("span" :: Text) []) + contents <- mconcat <$> manyTill inline (htmlTag (~== TagClose ("span" :: Text))) + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] T.words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ if isSmallCaps (ident, classes, keyvals) + then B.smallcaps <$> contents + else if isUnderline (ident, classes, keyvals) + then B.underline <$> contents + else 
B.spanWith (ident, classes, keyvals) <$> contents divHtml :: PandocMonad m => MarkdownParser m (F Blocks) -divHtml = try $ do +divHtml = do guardEnabled Ext_native_divs - (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen ("div" :: Text) []) - -- we set stateInHtmlBlock so that closing tags that can be either block or - -- inline will not be parsed as inline tags - oldInHtmlBlock <- stateInHtmlBlock <$> getState - updateState $ \st -> st{ stateInHtmlBlock = Just "div" } - bls <- option "" (blankline >> option "" blanklines) - contents <- mconcat <$> - many (notFollowedBy' (htmlTag (~== TagClose ("div" :: Text))) >> block) - closed <- option False (True <$ htmlTag (~== TagClose ("div" :: Text))) - if closed - then do - updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } - let ident = fromMaybe "" $ lookup "id" attrs - let classes = maybe [] T.words $ lookup "class" attrs - let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] - return $ B.divWith (ident, classes, keyvals) <$> contents - else -- avoid backtracing - return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents + try $ do + (TagOpen _ attrs, rawtag) <- htmlTag (~== TagOpen ("div" :: Text) []) + -- we set stateInHtmlBlock so that closing tags that can be either block + -- or inline will not be parsed as inline tags + oldInHtmlBlock <- stateInHtmlBlock <$> getState + updateState $ \st -> st{ stateInHtmlBlock = Just "div" } + bls <- option "" (blankline >> option "" blanklines) + contents <- mconcat <$> + many (notFollowedBy' (htmlTag (~== TagClose ("div" :: Text))) >> block) + closed <- option False (True <$ htmlTag (~== TagClose ("div" :: Text))) + if closed + then do + updateState $ \st -> st{ stateInHtmlBlock = oldInHtmlBlock } + let ident = fromMaybe "" $ lookup "id" attrs + let classes = maybe [] T.words $ lookup "class" attrs + let keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] + return $ B.divWith (ident, classes, keyvals) <$> contents + else -- avoid backtracing + return $ return (B.rawBlock "html" (rawtag <> bls)) <> contents divFenced :: PandocMonad m => MarkdownParser m (F Blocks) -divFenced = try $ do +divFenced = do guardEnabled Ext_fenced_divs - string ":::" - skipMany (char ':') - skipMany spaceChar - attribs <- attributes <|> ((\x -> ("",[x],[])) <$> many1Char nonspaceChar) - skipMany spaceChar - skipMany (char ':') - blankline - updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st + 1 } - bs <- mconcat <$> manyTill block divFenceEnd - updateState $ \st -> st{ stateFencedDivLevel = stateFencedDivLevel st - 1 } - return $ B.divWith attribs <$> bs + try $ do + string ":::" + skipMany (char ':') + skipMany spaceChar + attribs <- attributes <|> ((\x -> ("",[x],[])) <$> many1Char nonspaceChar) + skipMany spaceChar + skipMany (char ':') + blankline + updateState $ \st -> + st{ stateFencedDivLevel = stateFencedDivLevel st + 1 } + bs <- mconcat <$> manyTill block divFenceEnd + updateState $ \st -> + st{ stateFencedDivLevel = stateFencedDivLevel st - 1 } + return $ B.divWith attribs <$> bs divFenceEnd :: PandocMonad m => MarkdownParser m () divFenceEnd = try $ do @@ -2048,14 +2096,15 @@ emojiChars :: [Char] emojiChars = ['a'..'z'] ++ ['0'..'9'] ++ ['_','+','-'] emoji :: PandocMonad m => MarkdownParser m (F Inlines) -emoji = try $ do +emoji = do guardEnabled Ext_emoji - char ':' - emojikey <- many1Char (oneOf emojiChars) - char ':' - case emojiToInline emojikey of - Just i -> return (return $ B.singleton i) - Nothing -> mzero + try $ do + char ':' + emojikey <- 
many1Char (oneOf emojiChars) + char ':' + case emojiToInline emojikey of + Just i -> return (return $ B.singleton i) + Nothing -> mzero -- Citations @@ -2074,7 +2123,7 @@ cite = do textualCite :: PandocMonad m => MarkdownParser m (F Inlines) textualCite = try $ do - (suppressAuthor, key) <- citeKey + (suppressAuthor, key) <- citeKey True -- If this is a reference to an earlier example list item, -- then don't parse it as a citation. If the example list -- item comes later, we'll parse it here and figure out in @@ -2154,7 +2203,7 @@ prefix = trimInlinesF . mconcat <$> manyTill inline (char ']' <|> lookAhead (try $ do optional (try (char ';' >> spnl)) - citeKey + citeKey True return ']')) citeList :: PandocMonad m => MarkdownParser m (F [Citation]) @@ -2163,7 +2212,7 @@ citeList = fmap sequence $ sepBy1 citation (try $ char ';' >> spnl) citation :: PandocMonad m => MarkdownParser m (F Citation) citation = try $ do pref <- prefix - (suppress_author, key) <- citeKey + (suppress_author, key) <- citeKey True suff <- suffix noteNum <- stateNoteNumber <$> getState return $ do @@ -2182,28 +2231,30 @@ citation = try $ do smart :: PandocMonad m => MarkdownParser m (F Inlines) smart = do guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice (map (return <$>) [apostrophe, dash, ellipses]) + doubleQuoted <|> singleQuoted <|> (return <$> doubleCloseQuote) <|> + (return <$> apostrophe) <|> (return <$> dash) <|> (return <$> ellipses) singleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) -singleQuoted = try $ do +singleQuoted = do singleQuoteStart - withQuoteContext InSingleQuote $ + (try (withQuoteContext InSingleQuote $ fmap B.singleQuoted . trimInlinesF . mconcat <$> - many1Till inline singleQuoteEnd + many1Till inline singleQuoteEnd)) + <|> (return (return (B.str "\8217"))) -- doubleQuoted will handle regular double-quoted sections, as well -- as dialogues with an open double-quote without a close double-quote -- in the same paragraph. doubleQuoted :: PandocMonad m => MarkdownParser m (F Inlines) -doubleQuoted = try $ do +doubleQuoted = do doubleQuoteStart - withQuoteContext InDoubleQuote $ + (try (withQuoteContext InDoubleQuote $ fmap B.doubleQuoted . trimInlinesF . mconcat <$> - many1Till inline doubleQuoteEnd + many1Till inline doubleQuoteEnd)) + <|> (return (return (B.str "\8220"))) toRow :: [Blocks] -> Row toRow = Row nullAttr . map B.simpleCell toHeaderRow :: [Blocks] -> [Row] -toHeaderRow l = [toRow l | not (null l)] +toHeaderRow l = [toRow l | not (null l) && not (all null l)] diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index e8985ab2c..825e4a2eb 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.MediaWiki - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -36,17 +36,18 @@ import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (nested) import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag) -import Text.Pandoc.Shared (crFilter, safeRead, stringify, stripTrailingNewlines, +import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines, trim, splitTextBy, tshow) import Text.Pandoc.Walk (walk) import Text.Pandoc.XML (fromEntities) -- | Read mediawiki from an input string and return a Pandoc document. 
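One behavioral change worth noting in the hunks below: <ref>…</ref> bodies are now parsed as blocks, and a lone paragraph is collapsed back to Plain, so a simple reference still yields a compact note. Expected result for "text<ref>a note</ref>" (editorial sketch, not part of the patch; assumes the qualified Text.Pandoc.Builder import B and OverloadedStrings, as in this module):

expectedRefNote :: Inlines
expectedRefNote = "text" <> B.note (B.plain "a note")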
-readMediaWiki :: PandocMonad m - => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) +readMediaWiki :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a -> m Pandoc readMediaWiki opts s = do + let sources = toSources s parsed <- readWithM parseMediaWiki MWState{ mwOptions = opts , mwMaxNestingLevel = 4 , mwNextLinkNumber = 1 @@ -55,7 +56,7 @@ readMediaWiki opts s = do , mwLogMessages = [] , mwInTT = False } - (crFilter s <> "\n") + sources case parsed of Right result -> return result Left e -> throwError e @@ -69,7 +70,7 @@ data MWState = MWState { mwOptions :: ReaderOptions , mwInTT :: Bool } -type MWParser m = ParserT Text MWState m +type MWParser m = ParserT Sources MWState m instance HasReaderOptions MWState where extractReaderOptions = mwOptions @@ -112,12 +113,14 @@ newBlockTags = ["haskell","syntaxhighlight","source","gallery","references"] isBlockTag' :: Tag Text -> Bool isBlockTag' tag@(TagOpen t _) = (isBlockTag tag || t `elem` newBlockTags) && t `notElem` eitherBlockOrInline +isBlockTag' (TagClose "ref") = True -- needed so 'special' doesn't parse it isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) && t `notElem` eitherBlockOrInline isBlockTag' tag = isBlockTag tag isInlineTag' :: Tag Text -> Bool isInlineTag' (TagComment _) = True +isInlineTag' (TagClose "ref") = False -- see below inlineTag isInlineTag' t = not (isBlockTag' t) eitherBlockOrInline :: [Text] @@ -554,11 +557,17 @@ variable = try $ do contents <- manyTillChar anyChar (try $ string "}}}") return $ "{{{" <> contents <> "}}}" +singleParaToPlain :: Blocks -> Blocks +singleParaToPlain bs = + case B.toList bs of + [Para ils] -> B.fromList [Plain ils] + _ -> bs + inlineTag :: PandocMonad m => MWParser m Inlines inlineTag = do (tag, _) <- lookAhead $ htmlTag isInlineTag' case tag of - TagOpen "ref" _ -> B.note . B.plain <$> inlinesInTags "ref" + TagOpen "ref" _ -> B.note . singleParaToPlain <$> blocksInTags "ref" TagOpen "nowiki" _ -> try $ do (_,raw) <- htmlTag (~== tag) if T.any (== '/') raw @@ -678,19 +687,17 @@ url = do -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (PandocMonad m, Show b) => MWParser m a -> MWParser m b -> MWParser m Inlines inlinesBetween start end = - trimInlines . mconcat <$> try (start >> many1Till inner end) - where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) - innerSpace = try $ whitespace <* notFollowedBy' end + trimInlines . 
mconcat <$> try (start >> many1Till inline end) emph :: PandocMonad m => MWParser m Inlines emph = B.emph <$> nested (inlinesBetween start end) - where start = sym "''" >> lookAhead nonspaceChar + where start = sym "''" end = try $ notFollowedBy' (() <$ strong) >> sym "''" strong :: PandocMonad m => MWParser m Inlines strong = B.strong <$> nested (inlinesBetween start end) - where start = sym "'''" >> lookAhead nonspaceChar - end = try $ sym "'''" + where start = sym "'''" + end = sym "'''" doubleQuotes :: PandocMonad m => MWParser m Inlines doubleQuotes = do diff --git a/src/Text/Pandoc/Readers/Metadata.hs b/src/Text/Pandoc/Readers/Metadata.hs index a64b130e5..cbc523b25 100644 --- a/src/Text/Pandoc/Readers/Metadata.hs +++ b/src/Text/Pandoc/Readers/Metadata.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.Metadata - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -14,6 +14,7 @@ Parse YAML/JSON metadata to 'Pandoc' 'Meta'. module Text.Pandoc.Readers.Metadata ( yamlBsToMeta, yamlBsToRefs, + yamlMetaBlock, yamlMap ) where import Control.Monad @@ -30,11 +31,13 @@ import Text.Pandoc.Definition import Text.Pandoc.Error import Text.Pandoc.Parsing hiding (tableWith) import Text.Pandoc.Shared +import qualified Data.Text.Lazy as TL +import qualified Text.Pandoc.UTF8 as UTF8 -yamlBsToMeta :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlBsToMeta :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> BL.ByteString - -> ParserT Text ParserState m (F Meta) + -> ParserT Sources st m (Future st Meta) yamlBsToMeta pMetaValue bstr = do case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of Right (YAML.Doc (YAML.Mapping _ _ o):_) @@ -42,6 +45,9 @@ yamlBsToMeta pMetaValue bstr = do Right [] -> return . return $ mempty Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] -> return . return $ mempty + -- the following is what we get from a comment: + Right [YAML.Doc (YAML.Scalar _ (YAML.SUnknown _ ""))] + -> return . return $ mempty Right _ -> Prelude.fail "expected YAML object" Left (yamlpos, err') -> do pos <- getPosition @@ -63,11 +69,11 @@ lookupYAML t (YAML.Mapping _ _ m) = lookupYAML _ _ = Nothing -- Returns filtered list of references. -yamlBsToRefs :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlBsToRefs :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> (Text -> Bool) -- ^ Filter for id -> BL.ByteString - -> ParserT Text ParserState m (F [MetaValue]) + -> ParserT Sources st m (Future st [MetaValue]) yamlBsToRefs pMetaValue idpred bstr = case YAML.decodeNode' YAML.failsafeSchemaResolver False False bstr of Right (YAML.Doc o@YAML.Mapping{}:_) @@ -95,8 +101,12 @@ yamlBsToRefs pMetaValue idpred bstr = Right [YAML.Doc (YAML.Scalar _ YAML.SNull)] -> return . 
return $ mempty Right _ -> Prelude.fail "expecting YAML object" - Left (_pos, err') - -> Prelude.fail err' + Left (yamlpos, err') + -> do pos <- getPosition + setPosition $ incSourceLine + (setSourceColumn pos (YE.posColumn yamlpos)) + (YE.posLine yamlpos - 1) + Prelude.fail err' nodeToKey :: YAML.Node YE.Pos -> Maybe Text @@ -104,10 +114,10 @@ nodeToKey (YAML.Scalar _ (YAML.SStr t)) = Just t nodeToKey (YAML.Scalar _ (YAML.SUnknown _ t)) = Just t nodeToKey _ = Nothing -normalizeMetaValue :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +normalizeMetaValue :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> Text - -> ParserT Text ParserState m (F MetaValue) + -> ParserT Sources st m (Future st MetaValue) normalizeMetaValue pMetaValue x = -- Note: a standard quoted or unquoted YAML value will -- not end in a newline, but a "block" set off with @@ -129,10 +139,10 @@ checkBoolean t | t == T.pack "false" || t == T.pack "False" || t == T.pack "FALSE" = Just False | otherwise = Nothing -yamlToMetaValue :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlToMetaValue :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> YAML.Node YE.Pos - -> ParserT Text ParserState m (F MetaValue) + -> ParserT Sources st m (Future st MetaValue) yamlToMetaValue pMetaValue (YAML.Scalar _ x) = case x of YAML.SStr t -> normalizeMetaValue pMetaValue t @@ -152,10 +162,10 @@ yamlToMetaValue pMetaValue (YAML.Mapping _ _ o) = fmap MetaMap <$> yamlMap pMetaValue o yamlToMetaValue _ _ = return $ return $ MetaString "" -yamlMap :: PandocMonad m - => ParserT Text ParserState m (F MetaValue) +yamlMap :: (PandocMonad m, HasLastStrPosition st) + => ParserT Sources st m (Future st MetaValue) -> M.Map (YAML.Node YE.Pos) (YAML.Node YE.Pos) - -> ParserT Text ParserState m (F (M.Map Text MetaValue)) + -> ParserT Sources st m (Future st (M.Map Text MetaValue)) yamlMap pMetaValue o = do kvs <- forM (M.toList o) $ \(key, v) -> do k <- maybe (throwError $ PandocParseError @@ -171,3 +181,20 @@ yamlMap pMetaValue o = do return $ do v' <- fv return (k, v') + +-- | Parse a YAML metadata block using the supplied 'MetaValue' parser. +yamlMetaBlock :: (HasLastStrPosition st, PandocMonad m) + => ParserT Sources st m (Future st MetaValue) + -> ParserT Sources st m (Future st Meta) +yamlMetaBlock parser = try $ do + string "---" + blankline + notFollowedBy blankline -- if --- is followed by a blank it's an HRULE + rawYamlLines <- manyTill anyLine stopLine + -- by including --- and ..., we allow yaml blocks with just comments: + let rawYaml = T.unlines ("---" : (rawYamlLines ++ ["..."])) + optional blanklines + yamlBsToMeta parser $ UTF8.fromTextLazy $ TL.fromStrict rawYaml + +stopLine :: Monad m => ParserT Sources st m () +stopLine = try $ (string "---" <|> string "...") >> blankline >> return () diff --git a/src/Text/Pandoc/Readers/Muse.hs b/src/Text/Pandoc/Readers/Muse.hs index b4eea9d3a..a0d4534f1 100644 --- a/src/Text/Pandoc/Readers/Muse.hs +++ b/src/Text/Pandoc/Readers/Muse.hs @@ -36,19 +36,20 @@ import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (F) -import Text.Pandoc.Shared (crFilter, trimr, tshow) +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (trimr, tshow) -- | Read Muse from an input string and return a Pandoc document. 
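The readMuse changes below follow the same pattern as the other readers, with one extra wrinkle: a parse failure is now wrapped with the complete Sources value, so the reported position can be resolved against the file it actually came from. Roughly (editorial sketch, not part of the patch; assumes Text.Parsec.Error's ParseError and the Text.Pandoc.Error constructors are in scope):

wrapMuseError :: Sources -> ParseError -> PandocError
wrapMuseError = PandocParsecError
  -- previously this constructor carried a single Text; keeping the whole
  -- Sources lets the error message quote the offending (file, line) pair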
-readMuse :: PandocMonad m +readMuse :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readMuse opts s = do - let input = crFilter s - res <- flip runReaderT def $ runParserT parseMuse def{ museOptions = opts } "source" input + let sources = toSources s + res <- flip runReaderT def $ runParserT parseMuse def{ museOptions = opts } + (initialSourceName sources) sources case res of - Left e -> throwError $ PandocParsecError input e + Left e -> throwError $ PandocParsecError sources e Right d -> return d type F = Future MuseState @@ -82,7 +83,7 @@ instance Default MuseEnv where , museInPara = False } -type MuseParser m = ParserT Text MuseState (ReaderT MuseEnv m) +type MuseParser m = ParserT Sources MuseState (ReaderT MuseEnv m) instance HasReaderOptions MuseState where extractReaderOptions = museOptions @@ -155,7 +156,7 @@ firstColumn = getPosition >>= \pos -> guard (sourceColumn pos == 1) -- * Parsers -- | Parse end-of-line, which can be either a newline or end-of-file. -eol :: Stream s m Char => ParserT s st m () +eol :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m () eol = void newline <|> eof getIndent :: PandocMonad m diff --git a/src/Text/Pandoc/Readers/Native.hs b/src/Text/Pandoc/Readers/Native.hs index 42843381a..58f235e81 100644 --- a/src/Text/Pandoc/Readers/Native.hs +++ b/src/Text/Pandoc/Readers/Native.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Native - Copyright : Copyright (C) 2011-2020 John MacFarlane + Copyright : Copyright (C) 2011-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,6 +21,7 @@ import Control.Monad.Except (throwError) import Data.Text (Text) import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Error +import Text.Pandoc.Sources (ToSources(..), sourcesToText) -- | Read native formatted text and return a Pandoc document. -- The input may be a full pandoc document, a block list, a block, @@ -32,14 +33,15 @@ import Text.Pandoc.Error -- -- > Pandoc nullMeta [Plain [Str "hi"]] -- -readNative :: PandocMonad m +readNative :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readNative _ s = - case maybe (Pandoc nullMeta <$> readBlocks s) Right (safeRead s) of - Right doc -> return doc - Left _ -> throwError $ PandocParseError "couldn't read native" + let t = sourcesToText . toSources $ s + in case maybe (Pandoc nullMeta <$> readBlocks t) Right (safeRead t) of + Right doc -> return doc + Left _ -> throwError $ PandocParseError "couldn't read native" readBlocks :: Text -> Either PandocError [Block] readBlocks s = maybe ((:[]) <$> readBlock s) Right (safeRead s) diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index 903cdf4a1..668c9ca11 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.OPML - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -13,20 +13,21 @@ Conversion of OPML to 'Pandoc' document. 
module Text.Pandoc.Readers.OPML ( readOPML ) where import Control.Monad.State.Strict -import Data.Char (toUpper) import Data.Default -import Data.Generics import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T -import Text.HTML.TagSoup.Entity (lookupEntity) +import qualified Data.Text.Lazy as TL import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) -import Text.Pandoc.Shared (crFilter, blocksToInlines') -import Text.XML.Light +import Text.Pandoc.Shared (blocksToInlines') +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import Text.Pandoc.XML.Light +import Control.Monad.Except (throwError) type OPML m = StateT OPMLState m @@ -46,42 +47,27 @@ instance Default OPMLState where , opmlOptions = def } -readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readOPML :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readOPML opts inp = do - (bs, st') <- runStateT - (mapM parseBlock $ normalizeTree $ - parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts } + let sources = toSources inp + (bs, st') <- + runStateT (case parseXMLContents (TL.fromStrict . sourcesToText $ sources) of + Left msg -> throwError $ PandocXMLError "" msg + Right ns -> mapM parseBlock ns) + def{ opmlOptions = opts } return $ setTitle (opmlDocTitle st') $ setAuthors (opmlDocAuthors st') $ setDate (opmlDocDate st') $ doc $ mconcat bs --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) - -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) - -textContent :: Element -> Text -textContent = T.pack . strContent + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- exceptT :: PandocMonad m => Either PandocError a -> OPML m a -- exceptT = either throwError return @@ -105,11 +91,11 @@ parseBlock :: PandocMonad m => Content -> OPML m Blocks parseBlock (Elem e) = case qName (elName e) of "ownerName" -> mempty <$ modify (\st -> - st{opmlDocAuthors = [text $ textContent e]}) + st{opmlDocAuthors = [text $ strContent e]}) "dateModified" -> mempty <$ modify (\st -> - st{opmlDocDate = text $ textContent e}) + st{opmlDocDate = text $ strContent e}) "title" -> mempty <$ modify (\st -> - st{opmlDocTitle = text $ textContent e}) + st{opmlDocTitle = text $ strContent e}) "outline" -> gets opmlSectionLevel >>= sect . 
(+1) "?xml" -> return mempty _ -> getBlocks e diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 7b8bfd4b5..c274b6fd4 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE PatternGuards #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt @@ -15,7 +14,7 @@ Entry point to the odt reader. module Text.Pandoc.Readers.Odt ( readOdt ) where import Codec.Archive.Zip -import qualified Text.XML.Light as XML +import Text.Pandoc.XML.Light import qualified Data.ByteString.Lazy as B @@ -23,6 +22,8 @@ import System.FilePath import Control.Monad.Except (throwError) +import qualified Data.Text as T + import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition @@ -60,35 +61,37 @@ readOdt' _ bytes = bytesToOdt bytes-- of bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag) bytesToOdt bytes = case toArchiveOrFail bytes of Right archive -> archiveToOdt archive - Left _ -> Left $ PandocParseError "Couldn't parse odt file." + Left err -> Left $ PandocParseError + $ "Could not unzip ODT: " <> T.pack err -- archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) -archiveToOdt archive - | Just contentEntry <- findEntryByPath "content.xml" archive - , Just stylesEntry <- findEntryByPath "styles.xml" archive - , Just contentElem <- entryToXmlElem contentEntry - , Just stylesElem <- entryToXmlElem stylesEntry - , Right styles <- chooseMax (readStylesAt stylesElem ) - (readStylesAt contentElem) - , media <- filteredFilesFromArchive archive filePathIsOdtMedia - , startState <- readerState styles media - , Right pandocWithMedia <- runConverter' read_body - startState - contentElem - - = Right pandocWithMedia - - | otherwise - -- Not very detailed, but I don't think more information would be helpful - = Left $ PandocParseError "Couldn't parse odt file." - where - filePathIsOdtMedia :: FilePath -> Bool +archiveToOdt archive = do + let onFailure msg Nothing = Left $ PandocParseError msg + onFailure _ (Just x) = Right x + contentEntry <- onFailure "Could not find content.xml" + (findEntryByPath "content.xml" archive) + stylesEntry <- onFailure "Could not find styles.xml" + (findEntryByPath "styles.xml" archive) + contentElem <- entryToXmlElem contentEntry + stylesElem <- entryToXmlElem stylesEntry + styles <- either + (\_ -> Left $ PandocParseError "Could not read styles") + Right + (chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem)) + let filePathIsOdtMedia :: FilePath -> Bool filePathIsOdtMedia fp = let (dir, name) = splitFileName fp in (dir == "Pictures/") || (dir /= "./" && name == "content.xml") + let media = filteredFilesFromArchive archive filePathIsOdtMedia + let startState = readerState styles media + either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right + (runConverter' read_body startState contentElem) -- -entryToXmlElem :: Entry -> Maybe XML.Element -entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry +entryToXmlElem :: Entry -> Either PandocError Element +entryToXmlElem entry = + case parseXMLElement . UTF8.toTextLazy . 
fromEntry $ entry of + Right x -> Right x + Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs index 93c6b5e79..96515bf56 100644 --- a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs +++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs @@ -22,7 +22,7 @@ module Text.Pandoc.Readers.Odt.Arrows.State where import Control.Arrow import qualified Control.Category as Cat import Control.Monad - +import Data.List (foldl') import Text.Pandoc.Readers.Odt.Arrows.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -122,7 +122,7 @@ iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f iterateSL :: (Foldable f, MonadPlus m) => ArrowState s x y -> ArrowState s (f x) (m y) -iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f +iterateSL a = ArrowState $ \(s,f) -> foldl' a' (s,mzero) f where a' (s',m) x = second (mplus m.return) $ runArrowState a (s',x) diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 43c44e7e9..5520d039f 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -1,4 +1,5 @@ {-# LANGUAGE Arrows #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE PatternGuards #-} @@ -29,14 +30,14 @@ import Control.Monad ((<=<)) import qualified Data.ByteString.Lazy as B import Data.Foldable (fold) -import Data.List (find, stripPrefix) +import Data.List (find) import qualified Data.Map as M import qualified Data.Text as T import Data.Maybe -import Data.Semigroup (First(..), Option(..)) +import Data.Monoid (Alt (..)) import Text.TeXMath (readMathML, writeTeX) -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Builder hiding (underline) import Text.Pandoc.MediaBag (MediaBag, insertMedia) @@ -505,13 +506,11 @@ type InlineMatcher = ElementMatcher Inlines type BlockMatcher = ElementMatcher Blocks - -newtype FirstMatch a = FirstMatch (Option (First a)) - deriving (Foldable, Monoid, Semigroup) +newtype FirstMatch a = FirstMatch (Alt Maybe a) + deriving (Foldable, Monoid, Semigroup) firstMatch :: a -> FirstMatch a -firstMatch = FirstMatch . Option . Just . First - +firstMatch = FirstMatch . Alt . 
Just -- matchingElement :: (Monoid e) @@ -557,7 +556,7 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover >>?% mappend -- extractText :: XML.Content -> Fallible T.Text - extractText (XML.Text cData) = succeedWith (T.pack $ XML.cdData cData) + extractText (XML.Text cData) = succeedWith (XML.cdData cData) extractText _ = failEmpty read_text_seq :: InlineMatcher @@ -577,7 +576,10 @@ read_spaces = matchingElement NsText "s" ( read_line_break :: InlineMatcher read_line_break = matchingElement NsText "line-break" $ returnV linebreak - +-- +read_tab :: InlineMatcher +read_tab = matchingElement NsText "tab" + $ returnV space -- read_span :: InlineMatcher read_span = matchingElement NsText "span" @@ -585,6 +587,7 @@ read_span = matchingElement NsText "span" $ matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -604,6 +607,7 @@ read_paragraph = matchingElement NsText "p" $ matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -630,6 +634,7 @@ read_header = matchingElement NsText "h" children <- ( matchChildContent [ read_span , read_spaces , read_line_break + , read_tab , read_link , read_note , read_citation @@ -777,14 +782,14 @@ read_frame_img = "" -> returnV mempty -< () src' -> do let exts = extensionsFromList [Ext_auto_identifiers] - resource <- lookupResource -< src' + resource <- lookupResource -< T.unpack src' _ <- updateMediaWithResource -< resource w <- findAttrText' NsSVG "width" -< () h <- findAttrText' NsSVG "height" -< () titleNodes <- matchChildContent' [ read_frame_title ] -< () alt <- matchChildContent [] read_plain_text -< () arr (firstMatch . uncurry4 imageWith) -< - (image_attributes w h, T.pack src', inlineListToIdentifier exts (toList titleNodes), alt) + (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) read_frame_title :: InlineMatcher read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) @@ -804,7 +809,8 @@ read_frame_mathml = case fold src of "" -> returnV mempty -< () src' -> do - let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml" + let path = T.unpack $ + fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml" (_, mathml) <- lookupResource -< path case readMathML (UTF8.toText $ B.toStrict mathml) of Left _ -> returnV mempty -< () diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs index 77174c793..78a7fc0b2 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs @@ -14,9 +14,10 @@ typesafe Haskell namespace identifiers and unsafe "real world" namespaces. module Text.Pandoc.Readers.Odt.Generic.Namespaces where import qualified Data.Map as M +import Data.Text (Text) -- -type NameSpaceIRI = String +type NameSpaceIRI = Text -- type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs index 6dc56a0d9..edefe3c70 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -20,7 +20,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , reverseComposition , tryToRead , Lookupable(..) 
-, readLookupables , readLookupable , readPercent , findBy @@ -30,11 +29,11 @@ module Text.Pandoc.Readers.Odt.Generic.Utils import Control.Category (Category, (<<<), (>>>)) import qualified Control.Category as Cat (id) -import Control.Monad (msum) - +import Data.Char (isSpace) import qualified Data.Foldable as F (Foldable, foldr) import Data.Maybe - +import Data.Text (Text) +import qualified Data.Text as T -- | Equivalent to -- > foldr (.) id @@ -76,8 +75,8 @@ swing = flip.(.flip id) -- (nobody wants that) while the latter returns "to much" for simple purposes. -- This function instead applies 'reads' and returns the first match (if any) -- in a 'Maybe'. -tryToRead :: (Read r) => String -> Maybe r -tryToRead = reads >>> listToMaybe >>> fmap fst +tryToRead :: (Read r) => Text -> Maybe r +tryToRead = (reads . T.unpack) >>> listToMaybe >>> fmap fst -- | A version of 'reads' that requires a '%' sign after the number readPercent :: ReadS Int @@ -88,26 +87,12 @@ readPercent s = [ (i,s') | (i , r ) <- reads s -- | Data that can be looked up. -- This is mostly a utility to read data with kind *. class Lookupable a where - lookupTable :: [(String, a)] - --- | The idea is to use this function as if there was a declaration like --- --- > instance (Lookupable a) => (Read a) where --- > readsPrec _ = readLookupables --- . --- But including this code in this form would need UndecideableInstances. --- That is a bad idea. Luckily 'readLookupable' (without the s at the end) --- can be used directly in almost any case. -readLookupables :: (Lookupable a) => String -> [(a,String)] -readLookupables s = [ (a,rest) | (word,rest) <- lex s, - a <- maybeToList (lookup word lookupTable) - ] + lookupTable :: [(Text, a)] -- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer. -readLookupable :: (Lookupable a) => String -> Maybe a -readLookupable s = msum - $ map ((`lookup` lookupTable).fst) - $ lex s +readLookupable :: (Lookupable a) => Text -> Maybe a +readLookupable s = + lookup (T.takeWhile (not . 
isSpace) $ T.dropWhile isSpace s) lookupTable uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 00c636a0d..341903046 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE LambdaCase #-} @@ -60,15 +61,15 @@ import Control.Arrow import Data.Bool ( bool ) import Data.Either ( rights ) import qualified Data.Map as M -import qualified Data.Text as T +import Data.Text (Text) import Data.Default import Data.Maybe +import Data.List (foldl') -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Readers.Odt.Arrows.State import Text.Pandoc.Readers.Odt.Arrows.Utils - import Text.Pandoc.Readers.Odt.Generic.Namespaces import Text.Pandoc.Readers.Odt.Generic.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -78,13 +79,13 @@ import Text.Pandoc.Readers.Odt.Generic.Fallible -------------------------------------------------------------------------------- -- -type ElementName = String -type AttributeName = String -type AttributeValue = String -type TextAttributeValue = T.Text +type ElementName = Text +type AttributeName = Text +type AttributeValue = Text +type TextAttributeValue = Text -- -type NameSpacePrefix = String +type NameSpacePrefix = Text -- type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix @@ -292,7 +293,7 @@ readNSattributes = fromState $ \state -> maybe (state, failEmpty ) => XMLConverterState nsID extraState -> Maybe (XMLConverterState nsID extraState) extractNSAttrs startState - = foldl (\state d -> state >>= addNS d) + = foldl' (\state d -> state >>= addNS d) (Just startState) nsAttribs where nsAttribs = mapMaybe readNSattr (XML.elAttribs element) @@ -461,7 +462,7 @@ lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a) lookupDefaultingAttr nsID attrName = lookupAttrWithDefault nsID attrName def --- | Return value as a (Maybe String) +-- | Return value as a (Maybe Text) findAttr' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe AttributeValue) @@ -477,7 +478,6 @@ findAttrText' nsID attrName = qualifyName nsID attrName &&& getCurrentElement >>% XML.findAttr - >>^ fmap T.pack -- | Return value as string or fail findAttr :: (NameSpaceID nsID) @@ -492,7 +492,6 @@ findAttrText :: (NameSpaceID nsID) -> FallibleXMLConverter nsID extraState x TextAttributeValue findAttrText nsID attrName = findAttr' nsID attrName - >>^ fmap T.pack >>> maybeToChoice -- | Return value as string or return provided default value @@ -511,7 +510,7 @@ findAttrTextWithDefault :: (NameSpaceID nsID) -> XMLConverter nsID extraState x TextAttributeValue findAttrTextWithDefault nsID attrName deflt = findAttr' nsID attrName - >>^ maybe deflt T.pack + >>^ fromMaybe deflt -- | Read and return value or fail readAttr :: (NameSpaceID nsID, Read attrValue) @@ -748,7 +747,7 @@ matchContent lookups fallback -- Internals -------------------------------------------------------------------------------- -stringToBool' :: String -> Maybe Bool +stringToBool' :: Text -> Maybe Bool stringToBool' val | val `elem` trueValues = Just True | val `elem` falseValues = Just False | otherwise = Nothing diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs 
b/src/Text/Pandoc/Readers/Odt/Namespaces.hs index 3a24a1162..70741c28d 100644 --- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt.Namespaces Copyright : Copyright (C) 2015 Martin Linnemann @@ -13,10 +14,10 @@ Namespaces used in odt files. module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) ) where -import Data.List (isPrefixOf) import qualified Data.Map as M (empty, insert) import Data.Maybe (fromMaybe, listToMaybe) - +import Data.Text (Text) +import qualified Data.Text as T import Text.Pandoc.Readers.Odt.Generic.Namespaces @@ -30,7 +31,7 @@ instance NameSpaceID Namespace where findID :: NameSpaceIRI -> Maybe Namespace -findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri] +findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `T.isPrefixOf` iri] nsIDmap :: NameSpaceIRIs Namespace nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs @@ -54,12 +55,12 @@ data Namespace = -- Open Document core -- Core XML (basically only for the 'id'-attribute) | NsXML -- Fallback - | NsOther String + | NsOther Text deriving ( Eq, Ord, Show ) -- | Not the actual iri's, but large prefixes of them - this way there are -- less versioning problems and the like. -nsIDs :: [(String,Namespace)] +nsIDs :: [(Text, Namespace)] nsIDs = [ ("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ), ("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ), diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 46a777df1..ca791ad1e 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -2,6 +2,7 @@ {-# LANGUAGE Arrows #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Odt.StyleReader Copyright : Copyright (C) 2015 Martin Linnemann @@ -43,14 +44,16 @@ import Control.Arrow import Data.Default import qualified Data.Foldable as F -import Data.List (unfoldr) +import Data.List (unfoldr, foldl') import qualified Data.Map as M import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T import qualified Data.Set as S -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, tshow) import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -90,7 +93,7 @@ instance Default FontPitch where -- -- Thus, we want -type FontFaceName = String +type FontFaceName = Text type FontPitches = M.Map FontFaceName FontPitch @@ -117,7 +120,7 @@ fontPitchReader = executeInSub NsOffice "font-face-decls" ( &&& lookupDefaultingAttr NsStyle "font-pitch" )) - >>?^ ( M.fromList . foldl accumLegalPitches [] ) + >>?^ ( M.fromList . 
foldl' accumLegalPitches [] ) ) `ifFailedDo` returnV (Right M.empty) where accumLegalPitches ls (Nothing,_) = ls accumLegalPitches ls (Just n,p) = (n,p):ls @@ -151,7 +154,7 @@ findPitch = ( lookupAttr NsStyle "font-pitch" -- Definitions of main data -------------------------------------------------------------------------------- -type StyleName = String +type StyleName = Text -- | There are two types of styles: named styles with a style family and an -- optional style parent, and default styles for each style family, @@ -355,8 +358,8 @@ getListLevelStyle level ListStyle{..} = -- \^ simpler, but in general less efficient data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType - , listItemPrefix :: Maybe String - , listItemSuffix :: Maybe String + , listItemPrefix :: Maybe Text + , listItemSuffix :: Maybe Text , listItemFormat :: ListItemNumberFormat , listItemStart :: Int } @@ -366,9 +369,9 @@ instance Show ListLevelStyle where show ListLevelStyle{..} = "<LLS|" ++ show listLevelType ++ "|" - ++ maybeToString listItemPrefix + ++ maybeToString (T.unpack <$> listItemPrefix) ++ show listItemFormat - ++ maybeToString listItemSuffix + ++ maybeToString (T.unpack <$> listItemSuffix) ++ ">" where maybeToString = fromMaybe "" @@ -471,7 +474,7 @@ readTextProperties = ) where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] isFontBold = ("normal",False):("bold",True) - :map ((,True).show) ([100,200..900]::[Int]) + :map ((,True) . tshow) ([100,200..900]::[Int]) readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) readUnderlineMode = readLineMode "text-underline-mode" @@ -481,7 +484,7 @@ readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode) readStrikeThroughMode = readLineMode "text-line-through-mode" "text-line-through-style" -readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode) +readLineMode :: Text -> Text -> StyleReaderSafe _x (Maybe UnderlineMode) readLineMode modeAttr styleAttr = proc x -> do isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x mode <- lookupAttr' NsStyle modeAttr -< x @@ -560,12 +563,13 @@ readListLevelStyle levelType = readAttr NsText "level" -- chooseMostSpecificListLevelStyle :: S.Set ListLevelStyle -> Maybe ListLevelStyle -chooseMostSpecificListLevelStyle ls | ls == mempty = Nothing - | otherwise = Just ( F.foldr1 select ls ) +chooseMostSpecificListLevelStyle ls = F.foldr select Nothing ls where - select ( ListLevelStyle t1 p1 s1 f1 b1 ) - ( ListLevelStyle t2 p2 s2 f2 _ ) - = ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) (selectLinf f1 f2) b1 + select l Nothing = Just l + select ( ListLevelStyle t1 p1 s1 f1 b1 ) + ( Just ( ListLevelStyle t2 p2 s2 f2 _ )) + = Just $ ListLevelStyle (select' t1 t2) (p1 <|> p2) (s1 <|> s2) + (selectLinf f1 f2) b1 select' LltNumbered _ = LltNumbered select' _ LltNumbered = LltNumbered select' _ _ = LltBullet diff --git a/src/Text/Pandoc/Readers/Org.hs b/src/Text/Pandoc/Readers/Org.hs index 851aec103..8823befdd 100644 --- a/src/Text/Pandoc/Readers/Org.hs +++ b/src/Text/Pandoc/Readers/Org.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -18,22 +18,19 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing 
(reportLogMessages) -import Text.Pandoc.Shared (crFilter) - +import Text.Pandoc.Sources (ToSources(..), ensureFinalNewlines) import Control.Monad.Except (throwError) import Control.Monad.Reader (runReaderT) -import Data.Text (Text) - -- | Parse org-mode string and return a Pandoc document. -readOrg :: PandocMonad m +readOrg :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readOrg opts s = do parsed <- flip runReaderT def $ readWithM parseOrg (optionsToParserState opts) - (crFilter s <> "\n\n") + (ensureFinalNewlines 2 (toSources s)) case parsed of Right result -> return result Left e -> throwError e diff --git a/src/Text/Pandoc/Readers/Org/BlockStarts.hs b/src/Text/Pandoc/Readers/Org/BlockStarts.hs index 8f7cac6ea..14233569c 100644 --- a/src/Text/Pandoc/Readers/Org/BlockStarts.hs +++ b/src/Text/Pandoc/Readers/Org/BlockStarts.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.BlockStarts - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Blocks.hs b/src/Text/Pandoc/Readers/Org/Blocks.hs index f2e8b1ab6..f18d2f9a7 100644 --- a/src/Text/Pandoc/Readers/Org/Blocks.hs +++ b/src/Text/Pandoc/Readers/Org/Blocks.hs @@ -1,9 +1,10 @@ {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} {- | Module : Text.Pandoc.Readers.Org.Blocks - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -38,10 +39,12 @@ import Data.Functor (($>)) import Data.List (foldl', intersperse) import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Text (Text) - +import Data.List.NonEmpty (nonEmpty) +import System.FilePath import qualified Data.Text as T import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.Walk as Walk +import Text.Pandoc.Sources (ToSources(..)) -- -- parsing blocks @@ -294,24 +297,22 @@ verseBlock blockType = try $ do codeBlock :: PandocMonad m => BlockAttributes -> Text -> OrgParser m (F Blocks) codeBlock blockAttrs blockType = do skipSpaces - (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) - content <- rawBlockContent blockType - resultsContent <- option mempty babelResultsBlock - let id' = fromMaybe mempty $ blockAttrName blockAttrs - let codeBlck = B.codeBlockWith ( id', classes, kv ) content - let labelledBlck = maybe (pure codeBlck) - (labelDiv codeBlck) - (blockAttrCaption blockAttrs) + (classes, kv) <- codeHeaderArgs <|> (mempty <$ ignHeaders) + content <- rawBlockContent blockType + resultsContent <- option mempty babelResultsBlock + let identifier = fromMaybe mempty $ blockAttrName blockAttrs + let codeBlk = B.codeBlockWith (identifier, classes, kv) content + let wrap = maybe pure addCaption (blockAttrCaption blockAttrs) return $ - (if exportsCode kv then labelledBlck else mempty) <> + (if exportsCode kv then wrap codeBlk else mempty) <> (if exportsResults kv then resultsContent else mempty) where - labelDiv :: Blocks -> F Inlines -> F Blocks - labelDiv blk value = - B.divWith nullAttr <$> (mappend <$> labelledBlock value <*> pure blk) + addCaption :: F Inlines -> 
Blocks -> F Blocks + addCaption caption blk = B.divWith ("", ["captioned-content"], []) + <$> (mkCaptionBlock caption <> pure blk) - labelledBlock :: F Inlines -> F Blocks - labelledBlock = fmap (B.plain . B.spanWith ("", ["label"], [])) + mkCaptionBlock :: F Inlines -> F Blocks + mkCaptionBlock = fmap (B.divWith ("", ["caption"], []) . B.plain) exportsResults :: [(Text, Text)] -> Bool exportsResults = maybe False (`elem` ["results", "both"]) . lookup "exports" @@ -527,7 +528,9 @@ include = try $ do _ -> nullAttr return $ pure . B.codeBlockWith attr <$> parseRaw _ -> return $ return . B.fromList . blockFilter params <$> blockList - insertIncludedFileF blocksParser ["."] filename + currentDir <- takeDirectory . sourceName <$> getPosition + insertIncludedFile blocksParser toSources + [currentDir] filename Nothing Nothing where includeTarget :: PandocMonad m => OrgParser m FilePath includeTarget = do @@ -543,8 +546,7 @@ include = try $ do in case (minlvl >>= safeRead :: Maybe Int) of Nothing -> blks Just lvl -> let levels = Walk.query headerLevel blks - -- CAVE: partial function in else - curMin = if null levels then 0 else minimum levels + curMin = maybe 0 minimum $ nonEmpty levels in Walk.walk (shiftHeader (curMin - lvl)) blks headerLevel :: Block -> [Int] @@ -852,16 +854,52 @@ definitionListItem parseIndentedMarker = try $ do definitionMarker = spaceChar *> string "::" <* (spaceChar <|> lookAhead newline) +-- | Checkbox for tasks. +data Checkbox + = UncheckedBox + | CheckedBox + | SemicheckedBox + +-- | Parses a checkbox in a plain list. +checkbox :: PandocMonad m + => OrgParser m Checkbox +checkbox = do + guardEnabled Ext_task_lists + try (char '[' *> status <* char ']') <?> "checkbox" + where + status = choice + [ UncheckedBox <$ char ' ' + , CheckedBox <$ char 'X' + , SemicheckedBox <$ char '-' + ] + +checkboxToInlines :: Checkbox -> Inline +checkboxToInlines = B.Str . \case + UncheckedBox -> "☐" + SemicheckedBox -> "☐" + CheckedBox -> "☒" + -- | parse raw text for one list item listItem :: PandocMonad m => OrgParser m Int -> OrgParser m (F Blocks) listItem parseIndentedMarker = try . withContext ListItemState $ do markerLength <- try parseIndentedMarker + box <- optionMaybe checkbox firstLine <- anyLineNewline blank <- option "" ("\n" <$ blankline) rest <- T.concat <$> many (listContinuation markerLength) - parseFromString blocks $ firstLine <> blank <> rest + contents <- parseFromString blocks $ firstLine <> blank <> rest + return (maybe id (prependInlines . checkboxToInlines) box <$> contents) + +-- | Prepend inlines to blocks, adding them to the first paragraph or +-- creating a new Plain element if necessary. +prependInlines :: Inline -> Blocks -> Blocks +prependInlines inlns = B.fromList . prepend . B.toList + where + prepend (Plain is : bs) = Plain (inlns : Space : is) : bs + prepend (Para is : bs) = Para (inlns : Space : is) : bs + prepend bs = Plain [inlns, Space] : bs -- continuation of a list item - indented and separated by blankline or endline. -- Note: nested lists are parsed as continuations. 
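The hunks above replace the fixed Text input of the reader entry points with any ToSources instance. A minimal sketch of what that means for a caller — not part of the patch, and assuming the top-level Text.Pandoc API plus the Sources constructor exported by Text.Pandoc.Sources as used in these hunks:

import qualified Data.Text as T
import Text.Pandoc (def, handleError, readOrg, runIO)
import Text.Pandoc.Sources (Sources (..))
import Text.Parsec.Pos (initialPos)

main :: IO ()
main = do
  let orgText = T.pack "* Headline\n\nSome text\n"
  -- Plain Text still works; it is converted internally via toSources.
  doc1 <- handleError =<< runIO (readOrg def orgText)
  -- A Sources value pairs each chunk of input with a source position,
  -- so parse errors can point at a real file name.
  doc2 <- handleError =<< runIO
            (readOrg def (Sources [(initialPos "notes.org", orgText)]))
  print (doc1 == doc2)

Keeping positions in a Sources list instead of concatenating everything into one Text is also what the insertIncludedFile changes in the Org and RST readers rely on to report errors relative to the included file.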
diff --git a/src/Text/Pandoc/Readers/Org/DocumentTree.hs b/src/Text/Pandoc/Readers/Org/DocumentTree.hs index 3b363270c..2dcbecb1d 100644 --- a/src/Text/Pandoc/Readers/Org/DocumentTree.hs +++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs @@ -3,7 +3,7 @@ {-# LANGUAGE TupleSections #-} {- | Module : Text.Pandoc.Readers.Org.DocumentTree - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/ExportSettings.hs b/src/Text/Pandoc/Readers/Org/ExportSettings.hs index 9399ebd54..401e1bd8f 100644 --- a/src/Text/Pandoc/Readers/Org/ExportSettings.hs +++ b/src/Text/Pandoc/Readers/Org/ExportSettings.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.ExportSettings - Copyright : © 2016–2020 Albert Krewinkel + Copyright : © 2016-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index b234bee58..6862dd71e 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Inlines - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -29,6 +29,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.TeXMath (DisplayType (..), readTeX, writePandoc) +import Text.Pandoc.Sources (ToSources(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap import Control.Monad (guard, mplus, mzero, unless, void, when) @@ -262,7 +263,7 @@ berkeleyCitationList = try $ do where citationListPart :: PandocMonad m => OrgParser m (F Inlines) citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do - notFollowedBy' citeKey + notFollowedBy' $ citeKey False notFollowedBy (oneOf ";]") inline @@ -277,7 +278,7 @@ berkeleyBareTag' = try $ void (string "cite") berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) berkeleyTextualCite = try $ do - (suppressAuthor, key) <- citeKey + (suppressAuthor, key) <- citeKey False returnF . return $ Citation { citationId = key , citationPrefix = mempty @@ -322,7 +323,7 @@ linkLikeOrgRefCite = try $ do -- from the `org-ref-cite-re` variable in `org-ref.el`. orgRefCiteKey :: PandocMonad m => OrgParser m Text orgRefCiteKey = - let citeKeySpecialChars = "-_:\\./," :: String + let citeKeySpecialChars = "-_:\\./" :: String isCiteKeySpecialChar c = c `elem` citeKeySpecialChars isCiteKeyChar c = isAlphaNum c || isCiteKeySpecialChar c endOfCitation = try $ do @@ -350,7 +351,7 @@ citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) citation :: PandocMonad m => OrgParser m (F Citation) citation = try $ do pref <- prefix - (suppress_author, key) <- citeKey + (suppress_author, key) <- citeKey False suff <- suffix return $ do x <- pref @@ -367,7 +368,7 @@ citation = try $ do } where prefix = trimInlinesF . 
mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead citeKey)) + manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) skipSpaces @@ -477,17 +478,17 @@ linkToInlinesF linkStr = internalLink :: Text -> Inlines -> F Inlines internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB + ids <- asksF orgStateAnchorIds + if link `elem` ids then return $ B.link ("#" <> link) "" title - else return $ B.emph title + else let attr' = ("", ["spurious-link"] , [("target", link)]) + in return $ B.spanWith attr' (B.emph title) -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if -- @anchor-id@ contains spaces, we are more restrictive in what is accepted as -- an anchor. - anchor :: PandocMonad m => OrgParser m (F Inlines) anchor = try $ do anchorId <- parseAnchor @@ -501,7 +502,6 @@ anchor = try $ do -- | Replace every char but [a-zA-Z0-9_.-:] with a hyphen '-'. This mirrors -- the org function @org-export-solidify-link-text@. - solidify :: Text -> Text solidify = T.map replaceSpecialChar where replaceSpecialChar c @@ -573,7 +573,7 @@ underline :: PandocMonad m => OrgParser m (F Inlines) underline = fmap B.underline <$> emphasisBetween '_' verbatim :: PandocMonad m => OrgParser m (F Inlines) -verbatim = return . B.code <$> verbatimBetween '=' +verbatim = return . B.codeWith ("", ["verbatim"], []) <$> verbatimBetween '=' code :: PandocMonad m => OrgParser m (F Inlines) code = return . B.code <$> verbatimBetween '~' @@ -803,7 +803,7 @@ inlineLaTeX = try $ do parseAsInlineLaTeX :: PandocMonad m => Text -> TeXExport -> OrgParser m (Maybe Inlines) parseAsInlineLaTeX cs = \case - TeXExport -> maybeRight <$> runParserT inlineCommand state "" cs + TeXExport -> maybeRight <$> runParserT inlineCommand state "" (toSources cs) TeXIgnore -> return (Just mempty) TeXVerbatim -> return (Just $ B.str cs) diff --git a/src/Text/Pandoc/Readers/Org/Meta.hs b/src/Text/Pandoc/Readers/Org/Meta.hs index 4864d9478..a1b21046a 100644 --- a/src/Text/Pandoc/Readers/Org/Meta.hs +++ b/src/Text/Pandoc/Readers/Org/Meta.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Meta - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -239,7 +239,7 @@ lineOfInlines = do todoSequence :: Monad m => OrgParser m TodoSequence todoSequence = try $ do todoKws <- todoKeywords - doneKws <- optionMaybe $ todoDoneSep *> todoKeywords + doneKws <- optionMaybe $ todoDoneSep *> doneKeywords newline -- There must be at least one DONE keyword. The last TODO keyword is -- taken if necessary. 
@@ -250,11 +250,17 @@ todoSequence = try $ do (x:xs) -> return $ keywordsToSequence (reverse xs) [x] where + todoKeyword :: Monad m => OrgParser m Text + todoKeyword = many1Char nonspaceChar <* skipSpaces + todoKeywords :: Monad m => OrgParser m [Text] todoKeywords = try $ - let keyword = many1Char nonspaceChar <* skipSpaces - endOfKeywords = todoDoneSep <|> void newline - in manyTill keyword (lookAhead endOfKeywords) + let endOfKeywords = todoDoneSep <|> void newline + in manyTill todoKeyword (lookAhead endOfKeywords) + + doneKeywords :: Monad m => OrgParser m [Text] + doneKeywords = try $ + manyTill (todoKeyword <* optional todoDoneSep) (lookAhead newline) todoDoneSep :: Monad m => OrgParser m () todoDoneSep = void . try $ skipSpaces *> char '|' <* skipSpaces1 diff --git a/src/Text/Pandoc/Readers/Org/ParserState.hs b/src/Text/Pandoc/Readers/Org/ParserState.hs index 1e4799e7b..abe8a9ebf 100644 --- a/src/Text/Pandoc/Readers/Org/ParserState.hs +++ b/src/Text/Pandoc/Readers/Org/ParserState.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.ParserState - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/Org/Parsing.hs b/src/Text/Pandoc/Readers/Org/Parsing.hs index bce71c24d..f0949e205 100644 --- a/src/Text/Pandoc/Readers/Org/Parsing.hs +++ b/src/Text/Pandoc/Readers/Org/Parsing.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Readers.Org.Parsing - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -63,8 +63,7 @@ module Text.Pandoc.Readers.Org.Parsing , ellipses , citeKey , gridTableWith - , insertIncludedFileF - -- * Re-exports from Text.Pandoc.Parsec + , insertIncludedFile , runParser , runParserT , getInput @@ -100,21 +99,22 @@ module Text.Pandoc.Readers.Org.Parsing , getState , updateState , SourcePos + , sourceName , getPosition ) where import Data.Text (Text) import Text.Pandoc.Readers.Org.ParserState -import Text.Pandoc.Parsing hiding (F, anyLine, blanklines, newline, - parseFromString) +import Text.Pandoc.Parsing hiding (anyLine, blanklines, newline, + parseFromString) import qualified Text.Pandoc.Parsing as P import Control.Monad (guard) import Control.Monad.Reader (ReaderT) -- | The parser used to read org files. 
-type OrgParser m = ParserT Text OrgParserState (ReaderT OrgParserLocal m) +type OrgParser m = ParserT Sources OrgParserState (ReaderT OrgParserLocal m) -- -- Adaptions and specializations of parsing utilities diff --git a/src/Text/Pandoc/Readers/Org/Shared.hs b/src/Text/Pandoc/Readers/Org/Shared.hs index 7f72077a4..ad7c65060 100644 --- a/src/Text/Pandoc/Readers/Org/Shared.hs +++ b/src/Text/Pandoc/Readers/Org/Shared.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Org.Shared - Copyright : Copyright (C) 2014-2020 Albert Krewinkel + Copyright : Copyright (C) 2014-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index eeb3d1389..3990f0cb5 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -4,7 +4,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.RST - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -27,8 +27,7 @@ import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Builder (Blocks, Inlines, fromList, setMeta, trimInlines) import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Class.PandocMonad (PandocMonad, fetchItem, - readFileFromDirs, getCurrentTime) +import Text.Pandoc.Class.PandocMonad (PandocMonad, fetchItem, getTimestamp) import Text.Pandoc.CSV (CSVOptions (..), defaultCSVOptions, parseCSV) import Text.Pandoc.Definition import Text.Pandoc.Error @@ -38,25 +37,25 @@ import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared import qualified Text.Pandoc.UTF8 as UTF8 -import Text.Printf (printf) import Data.Time.Format +import System.FilePath (takeDirectory) -- TODO: -- [ ] .. parsed-literal -- | Parse reStructuredText string and return Pandoc document. -readRST :: PandocMonad m +readRST :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readRST opts s = do parsed <- readWithM parseRST def{ stateOptions = opts } - (crFilter s <> "\n\n") + (ensureFinalNewlines 2 (toSources s)) case parsed of Right result -> return result Left e -> throwError e -type RSTParser m = ParserT Text ParserState m +type RSTParser m = ParserT Sources ParserState m -- -- Constants and data structure definitions @@ -151,11 +150,19 @@ parseRST = do startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys were... - docMinusKeys <- T.concat <$> - manyTill (referenceKey <|> anchorDef <|> - noteBlock <|> citationBlock <|> - (snd <$> withRaw comment) <|> - headerBlock <|> lineClump) eof + let chunk = referenceKey + <|> anchorDef + <|> noteBlock + <|> citationBlock + <|> (snd <$> withRaw comment) + <|> headerBlock + <|> lineClump + docMinusKeys <- Sources <$> + manyTill (do pos <- getPosition + t <- chunk + return (pos, t)) eof + -- UGLY: we collapse source position information. 
+ -- TODO: fix the parser to use the F monad instead of two passes setInput docMinusKeys setPosition startPos st' <- getState @@ -348,7 +355,7 @@ singleHeader' = try $ do -- hrule block -- -hrule :: Monad m => ParserT Text st m Blocks +hrule :: Monad m => ParserT Sources st m Blocks hrule = try $ do chr <- oneOf underlineChars count 3 (char chr) @@ -363,7 +370,7 @@ hrule = try $ do -- read a line indented by a given string indentedLine :: (HasReaderOptions st, Monad m) - => Int -> ParserT Text st m Text + => Int -> ParserT Sources st m Text indentedLine indents = try $ do lookAhead spaceChar gobbleAtMostSpaces indents @@ -372,7 +379,7 @@ indentedLine indents = try $ do -- one or more indented lines, possibly separated by blank lines. -- any amount of indentation will work. indentedBlock :: (HasReaderOptions st, Monad m) - => ParserT Text st m Text + => ParserT Sources st m Text indentedBlock = try $ do indents <- length <$> lookAhead (many1 spaceChar) lns <- many1 $ try $ do b <- option "" blanklines @@ -381,20 +388,20 @@ indentedBlock = try $ do optional blanklines return $ T.unlines lns -quotedBlock :: Monad m => ParserT Text st m Text +quotedBlock :: Monad m => ParserT Sources st m Text quotedBlock = try $ do quote <- lookAhead $ oneOf "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" lns <- many1 $ lookAhead (char quote) >> anyLine optional blanklines return $ T.unlines lns -codeBlockStart :: Monad m => ParserT Text st m Char +codeBlockStart :: Monad m => ParserT Sources st m Char codeBlockStart = string "::" >> blankline >> blankline -codeBlock :: Monad m => ParserT Text ParserState m Blocks +codeBlock :: Monad m => ParserT Sources ParserState m Blocks codeBlock = try $ codeBlockStart >> codeBlockBody -codeBlockBody :: Monad m => ParserT Text ParserState m Blocks +codeBlockBody :: Monad m => ParserT Sources ParserState m Blocks codeBlockBody = do lang <- stateRstHighlight <$> getState try $ B.codeBlockWith ("", maybeToList lang, []) . 
stripTrailingNewlines <$> @@ -410,14 +417,14 @@ lhsCodeBlock = try $ do return $ B.codeBlockWith ("", ["haskell","literate"], []) $ T.intercalate "\n" lns -latexCodeBlock :: Monad m => ParserT Text st m [Text] +latexCodeBlock :: Monad m => ParserT Sources st m [Text] latexCodeBlock = try $ do try (latexBlockLine "\\begin{code}") many1Till anyLine (try $ latexBlockLine "\\end{code}") where latexBlockLine s = skipMany spaceChar >> string s >> blankline -birdCodeBlock :: Monad m => ParserT Text st m [Text] +birdCodeBlock :: Monad m => ParserT Sources st m [Text] birdCodeBlock = filterSpace <$> many1 birdTrackLine where filterSpace lns = -- if (as is normal) there is always a space after >, drop it @@ -425,7 +432,7 @@ birdCodeBlock = filterSpace <$> many1 birdTrackLine then map (T.drop 1) lns else lns -birdTrackLine :: Monad m => ParserT Text st m Text +birdTrackLine :: Monad m => ParserT Sources st m Text birdTrackLine = char '>' >> anyLine -- @@ -446,64 +453,43 @@ encoding -} includeDirective :: PandocMonad m - => Text -> [(Text, Text)] -> Text + => Text + -> [(Text, Text)] + -> Text -> RSTParser m Blocks includeDirective top fields body = do - let f = trim top - guard $ not (T.null f) + let f = T.unpack $ trim top + guard $ not $ null f guard $ T.null (trim body) - -- options - let (startLine :: Maybe Int) = lookup "start-line" fields >>= safeRead - let (endLine :: Maybe Int) = lookup "end-line" fields >>= safeRead - oldPos <- getPosition - oldInput <- getInput - containers <- stateContainers <$> getState - when (f `elem` containers) $ - throwError $ PandocParseError $ "Include file loop at " <> tshow oldPos - updateState $ \s -> s{ stateContainers = f : stateContainers s } - mbContents <- readFileFromDirs ["."] $ T.unpack f - contentLines <- case mbContents of - Just s -> return $ T.lines s - Nothing -> do - logMessage $ CouldNotLoadIncludeFile f oldPos - return [] - let numLines = length contentLines - let startLine' = case startLine of - Nothing -> 1 - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - let endLine' = case endLine of - Nothing -> numLines + 1 - Just x | x >= 0 -> x - | otherwise -> numLines + x -- negative from end - let contentLines' = drop (startLine' - 1) - $ take (endLine' - 1) contentLines - let contentLines'' = (case trim <$> lookup "end-before" fields of - Just patt -> takeWhile (not . (patt `T.isInfixOf`)) - Nothing -> id) . - (case trim <$> lookup "start-after" fields of - Just patt -> drop 1 . - dropWhile (not . (patt `T.isInfixOf`)) - Nothing -> id) $ contentLines' - let contents' = T.unlines contentLines'' - case lookup "code" fields of - Just lang -> do - let classes = maybe [] T.words (lookup "class" fields) - let ident = maybe "" trimr $ lookup "name" fields - codeblock ident classes fields (trimr lang) contents' False - Nothing -> case lookup "literal" fields of - Just _ -> return $ B.rawBlock "rst" contents' - Nothing -> do - setPosition $ newPos (T.unpack f) 1 1 - setInput $ contents' <> "\n" - bs <- optional blanklines >> - (mconcat <$> many block) - setInput oldInput - setPosition oldPos - updateState $ \s -> s{ stateContainers = - tail $ stateContainers s } - return bs - + let startLine = lookup "start-line" fields >>= safeRead + let endLine = lookup "end-line" fields >>= safeRead + let classes = maybe [] T.words (lookup "class" fields) + let ident = maybe "" trimr $ lookup "name" fields + let parser = + case lookup "code" fields of + Just lang -> + (codeblock ident classes fields (trimr lang) False + . 
sourcesToText) <$> getInput + Nothing -> + case lookup "literal" fields of + Just _ -> B.rawBlock "rst" . sourcesToText <$> getInput + Nothing -> parseBlocks + let isLiteral = isJust (lookup "code" fields `mplus` lookup "literal" fields) + let selectLines = + (case trim <$> lookup "end-before" fields of + Just patt -> takeWhile (not . (patt `T.isInfixOf`)) + Nothing -> id) . + (case trim <$> lookup "start-after" fields of + Just patt -> drop 1 . + dropWhile (not . (patt `T.isInfixOf`)) + Nothing -> id) + + let toStream t = + Sources [(initialPos f, + (T.unlines . selectLines . T.lines $ t) <> + if isLiteral then mempty else "\n")] -- see #7436 + currentDir <- takeDirectory . sourceName <$> getPosition + insertIncludedFile parser toStream [currentDir] f startLine endLine -- -- list blocks @@ -526,7 +512,7 @@ definitionList :: PandocMonad m => RSTParser m Blocks definitionList = B.definitionList <$> many1 definitionListItem -- parses bullet list start and returns its length (inc. following whitespace) -bulletListStart :: Monad m => ParserT Text st m Int +bulletListStart :: Monad m => ParserT Sources st m Int bulletListStart = try $ do notFollowedBy' hrule -- because hrules start out just like lists marker <- oneOf bulletListMarkers @@ -613,8 +599,9 @@ comment = try $ do string ".." skipMany1 spaceChar <|> (() <$ lookAhead newline) -- notFollowedBy' directiveLabel -- comment comes after directive so unnec. - manyTill anyChar blanklines + _ <- anyLine optional indentedBlock + optional blanklines return mempty directiveLabel :: Monad m => RSTParser m Text @@ -685,7 +672,7 @@ directive' = do "replace" -> B.para <$> -- consumed by substKey parseInlineFromText (trim top) "date" -> B.para <$> do -- consumed by substKey - t <- getCurrentTime + t <- getTimestamp let format = case T.unpack (T.strip top) of [] -> "%Y-%m-%d" x -> x @@ -731,8 +718,8 @@ directive' = do "" -> stateRstHighlight def lang -> Just lang }) x | x == "code" || x == "code-block" || x == "sourcecode" -> - codeblock name classes (map (second trimr) fields) - (trim top) body True + return $ codeblock name classes (map (second trimr) fields) + (trim top) True body "aafig" -> do let attribs = (name, ["aafig"], map (second trimr) fields) return $ B.codeBlockWith attribs $ stripTrailingNewlines body @@ -877,10 +864,11 @@ csvTableDirective top fields rawcsv = do (bs, _) <- fetchItem u return $ UTF8.toText bs Nothing -> return rawcsv - let res = parseCSV opts (case explicitHeader of - Just h -> h <> "\n" <> rawcsv' - Nothing -> rawcsv') - case res of + let header' = case explicitHeader of + Just h -> parseCSV defaultCSVOptions h + Nothing -> Right [] + let res = parseCSV opts rawcsv' + case (<>) <$> header' <*> res of Left e -> throwError $ PandocParsecError "csv table" e Right rawrows -> do @@ -1017,10 +1005,10 @@ toChunks = dropWhile T.null then "\\begin{aligned}\n" <> s <> "\n\\end{aligned}" else s -codeblock :: Text -> [Text] -> [(Text, Text)] -> Text -> Text -> Bool - -> RSTParser m Blocks -codeblock ident classes fields lang body rmTrailingNewlines = - return $ B.codeBlockWith attribs $ stripTrailingNewlines' body +codeblock :: Text -> [Text] -> [(Text, Text)] -> Text -> Bool -> Text + -> Blocks +codeblock ident classes fields lang rmTrailingNewlines body = + B.codeBlockWith attribs $ stripTrailingNewlines' body where stripTrailingNewlines' = if rmTrailingNewlines then stripTrailingNewlines else id @@ -1101,7 +1089,7 @@ quotedReferenceName = try $ do -- plus isolated (no two adjacent) internal hyphens, underscores, -- periods, colons 
and plus signs; no whitespace or other characters -- are allowed. -simpleReferenceName :: Monad m => ParserT Text st m Text +simpleReferenceName :: Monad m => ParserT Sources st m Text simpleReferenceName = do x <- alphaNum xs <- many $ alphaNum @@ -1120,7 +1108,7 @@ referenceKey = do -- return enough blanks to replace key return $ T.replicate (sourceLine endPos - sourceLine startPos) "\n" -targetURI :: Monad m => ParserT Text st m Text +targetURI :: Monad m => ParserT Sources st m Text targetURI = do skipSpaces optional $ try $ newline >> notFollowedBy blankline @@ -1158,8 +1146,10 @@ anonymousKey :: Monad m => RSTParser m () anonymousKey = try $ do oneOfStrings [".. __:", "__"] src <- targetURI - pos <- getPosition - let key = toKey $ "_" <> T.pack (printf "%09d" (sourceLine pos)) + -- we need to ensure that the keys are ordered by occurrence in + -- the document. + numKeys <- M.size . stateKeys <$> getState + let key = toKey $ "_" <> T.pack (show numKeys) updateState $ \s -> s { stateKeys = M.insert key ((src,""), nullAttr) $ stateKeys s } @@ -1248,13 +1238,13 @@ headerBlock = do -- Grid tables TODO: -- - column spans -dashedLine :: Monad m => Char -> ParserT Text st m (Int, Int) +dashedLine :: Monad m => Char -> ParserT Sources st m (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many (char ' ') return (length dashes, length $ dashes ++ sp) -simpleDashedLines :: Monad m => Char -> ParserT Text st m [(Int,Int)] +simpleDashedLines :: Monad m => Char -> ParserT Sources st m [(Int,Int)] simpleDashedLines ch = try $ many1 (dashedLine ch) -- Parse a table row separator @@ -1380,7 +1370,7 @@ hyphens = do -- don't want to treat endline after hyphen or dash as a space return $ B.str result -escapedChar :: Monad m => ParserT Text st m Inlines +escapedChar :: Monad m => ParserT Sources st m Inlines escapedChar = do c <- escaped anyChar return $ if c == ' ' || c == '\n' || c == '\r' -- '\ ' is null in RST @@ -1656,21 +1646,4 @@ note = try $ do return $ B.note contents smart :: PandocMonad m => RSTParser m Inlines -smart = do - guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice [apostrophe, dash, ellipses] - -singleQuoted :: PandocMonad m => RSTParser m Inlines -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote $ - B.singleQuoted . trimInlines . mconcat <$> - many1Till inline singleQuoteEnd - -doubleQuoted :: PandocMonad m => RSTParser m Inlines -doubleQuoted = try $ do - doubleQuoteStart - withQuoteContext InDoubleQuote $ - B.doubleQuoted . trimInlines . 
mconcat <$> - many1Till inline doubleQuoteEnd +smart = smartPunctuation inline diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs index 509ce1377..47f16ef4b 100644 --- a/src/Text/Pandoc/Readers/Roff.hs +++ b/src/Text/Pandoc/Readers/Roff.hs @@ -42,7 +42,6 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared (safeRead) -import Text.Parsec hiding (tokenPrim) import Text.Pandoc.RoffChar (characterCodes, combiningAccents) import qualified Data.Sequence as Seq import qualified Data.Foldable as Foldable @@ -122,16 +121,16 @@ instance Default RoffState where , afterConditional = False } -type RoffLexer m = ParserT T.Text RoffState m +type RoffLexer m = ParserT Sources RoffState m -- -- Lexer: T.Text -> RoffToken -- -eofline :: Stream s m Char => ParsecT s u m () +eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s u m () eofline = void newline <|> eof <|> () <$ lookAhead (string "\\}") -spacetab :: Stream s m Char => ParsecT s u m Char +spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s u m Char spacetab = char ' ' <|> char '\t' characterCodeMap :: M.Map T.Text Char @@ -303,8 +302,7 @@ expandString = try $ do char '*' cs <- escapeArg <|> countChar 1 anyChar s <- linePartsToText <$> resolveText cs pos - getInput >>= setInput . (s <>) - return () + addToInput s -- Parses: '..' quoteArg :: PandocMonad m => RoffLexer m T.Text @@ -316,7 +314,7 @@ escFont = do font' <- if T.null font || font == "P" then prevFont <$> getState else return $ foldr processFontLetter defaultFontSpec $ T.unpack font - modifyState $ \st -> st{ prevFont = currentFont st + updateState $ \st -> st{ prevFont = currentFont st , currentFont = font' } return [Font font'] where @@ -372,8 +370,8 @@ lexTable pos = do spaces opts <- try tableOptions <|> [] <$ optional (char ';') case lookup "tab" opts of - Just (T.uncons -> Just (c, _)) -> modifyState $ \st -> st{ tableTabChar = c } - _ -> modifyState $ \st -> st{ tableTabChar = '\t' } + Just (T.uncons -> Just (c, _)) -> updateState $ \st -> st{ tableTabChar = c } + _ -> updateState $ \st -> st{ tableTabChar = '\t' } spaces skipMany lexComment spaces @@ -489,18 +487,18 @@ lexConditional mname = do ifPart <- do optional $ try $ char '\\' >> newline lexGroup - <|> do modifyState $ \s -> s{ afterConditional = True } + <|> do updateState $ \s -> s{ afterConditional = True } t <- manToken - modifyState $ \s -> s{ afterConditional = False } + updateState $ \s -> s{ afterConditional = False } return t case mbtest of Nothing -> do - putState st -- reset state, so we don't record macros in skipped section + setState st -- reset state, so we don't record macros in skipped section report $ SkippedContent (T.cons '.' mname) pos return mempty Just True -> return ifPart Just False -> do - putState st + setState st return mempty expression :: PandocMonad m => RoffLexer m (Maybe Bool) @@ -515,7 +513,7 @@ expression = do _ -> Nothing where returnValue v = do - modifyState $ \st -> st{ lastExpression = v } + updateState $ \st -> st{ lastExpression = v } return v lexGroup :: PandocMonad m => RoffLexer m RoffTokens @@ -536,7 +534,7 @@ lexIncludeFile args = do result <- readFileFromDirs dirs $ T.unpack fp case result of Nothing -> report $ CouldNotLoadIncludeFile fp pos - Just s -> getInput >>= setInput . 
(s <>) + Just s -> addToInput s return mempty [] -> return mempty @@ -564,13 +562,13 @@ lexStringDef args = do -- string definition (x:ys) -> do let ts = singleTok $ TextLine (intercalate [RoffStr " " ] ys) let stringName = linePartsToText x - modifyState $ \st -> + updateState $ \st -> st{ customMacros = M.insert stringName ts (customMacros st) } return mempty lexMacroDef :: PandocMonad m => [Arg] -> RoffLexer m RoffTokens lexMacroDef args = do -- macro definition - modifyState $ \st -> st{ roffMode = CopyMode } + updateState $ \st -> st{ roffMode = CopyMode } (macroName, stopMacro) <- case args of (x : y : _) -> return (linePartsToText x, linePartsToText y) @@ -584,7 +582,7 @@ lexMacroDef args = do -- macro definition _ <- lexArgs return () ts <- mconcat <$> manyTill manToken stop - modifyState $ \st -> + updateState $ \st -> st{ customMacros = M.insert macroName ts (customMacros st) , roffMode = NormalMode } return mempty diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs index 484a6c923..276d28aaa 100644 --- a/src/Text/Pandoc/Readers/TWiki.hs +++ b/src/Text/Pandoc/Readers/TWiki.hs @@ -28,22 +28,22 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) -import Text.Pandoc.Shared (crFilter, tshow) +import Text.Pandoc.Shared (tshow) import Text.Pandoc.XML (fromEntities) -- | Read twiki from an input string and return a Pandoc document. -readTWiki :: PandocMonad m +readTWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTWiki opts s = do - res <- readWithM parseTWiki def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseTWiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type TWParser = ParserT Text ParserState +type TWParser = ParserT Sources ParserState -- -- utility functions @@ -469,27 +469,7 @@ symbol :: PandocMonad m => TWParser m B.Inlines symbol = B.str <$> countChar 1 nonspaceChar smart :: PandocMonad m => TWParser m B.Inlines -smart = do - guardEnabled Ext_smart - doubleQuoted <|> singleQuoted <|> - choice [ apostrophe - , dash - , ellipses - ] - -singleQuoted :: PandocMonad m => TWParser m B.Inlines -singleQuoted = try $ do - singleQuoteStart - withQuoteContext InSingleQuote - (B.singleQuoted . B.trimInlines . mconcat <$> many1Till inline singleQuoteEnd) - -doubleQuoted :: PandocMonad m => TWParser m B.Inlines -doubleQuoted = try $ do - doubleQuoteStart - contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) - withQuoteContext InDoubleQuote (doubleQuoteEnd >> - return (B.doubleQuoted $ B.trimInlines contents)) - <|> return (B.str "\8220" B.<> contents) +smart = smartPunctuation inline link :: PandocMonad m => TWParser m B.Inlines link = try $ do diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 6691d8381..981878206 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -3,7 +3,7 @@ {- | Module : Text.Pandoc.Readers.Textile Copyright : Copyright (C) 2010-2012 Paul Rivier - 2010-2020 John MacFarlane + 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : Paul Rivier <paul*rivier#demotera*com> @@ -11,7 +11,7 @@ Portability : portable Conversion from Textile to 'Pandoc' document, based on the spec -available at http://redcloth.org/textile. 
+available at https://www.promptworks.com/textile/. Implemented and parsed: - Paragraphs @@ -38,7 +38,8 @@ module Text.Pandoc.Readers.Textile ( readTextile) where import Control.Monad (guard, liftM) import Control.Monad.Except (throwError) import Data.Char (digitToInt, isUpper) -import Data.List (intersperse, transpose) +import Data.List (intersperse, transpose, foldl') +import Data.List.NonEmpty (NonEmpty(..), nonEmpty) import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup (Tag (..), fromAttrib) @@ -52,30 +53,34 @@ import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isInlineTag) import Text.Pandoc.Readers.LaTeX (rawLaTeXBlock, rawLaTeXInline) -import Text.Pandoc.Shared (crFilter, trim, tshow) +import Text.Pandoc.Shared (trim, tshow) -- | Parse a Textile text and return a Pandoc document. -readTextile :: PandocMonad m +readTextile :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assuming @'\n'@ line endings) + -> a -> m Pandoc readTextile opts s = do - parsed <- readWithM parseTextile def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + parsed <- readWithM parseTextile def{ stateOptions = opts } sources case parsed of Right result -> return result Left e -> throwError e +type TextileParser = ParserT Sources ParserState -- | Generate a Pandoc ADT from a textile document -parseTextile :: PandocMonad m => ParserT Text ParserState m Pandoc +parseTextile :: PandocMonad m => TextileParser m Pandoc parseTextile = do many blankline startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys/notes were... - let firstPassParser = noteBlock <|> lineClump - manyTill firstPassParser eof >>= setInput . T.concat + let firstPassParser = do + pos <- getPosition + t <- noteBlock <|> lineClump + return (pos, t) + manyTill firstPassParser eof >>= setInput . Sources setPosition startPos st' <- getState let reversedNotes = stateNotes st' @@ -83,10 +88,10 @@ parseTextile = do -- now parse it for real... Pandoc nullMeta . 
B.toList <$> parseBlocks -- FIXME -noteMarker :: PandocMonad m => ParserT Text ParserState m Text +noteMarker :: PandocMonad m => TextileParser m Text noteMarker = skipMany spaceChar >> string "fn" >> T.pack <$> manyTill digit (char '.') -noteBlock :: PandocMonad m => ParserT Text ParserState m Text +noteBlock :: PandocMonad m => TextileParser m Text noteBlock = try $ do startPos <- getPosition ref <- noteMarker @@ -101,11 +106,11 @@ noteBlock = try $ do return $ T.replicate (sourceLine endPos - sourceLine startPos) "\n" -- | Parse document blocks -parseBlocks :: PandocMonad m => ParserT Text ParserState m Blocks +parseBlocks :: PandocMonad m => TextileParser m Blocks parseBlocks = mconcat <$> manyTill block eof -- | Block parsers list tried in definition order -blockParsers :: PandocMonad m => [ParserT Text ParserState m Blocks] +blockParsers :: PandocMonad m => [TextileParser m Blocks] blockParsers = [ codeBlock , header , blockQuote @@ -120,22 +125,22 @@ blockParsers = [ codeBlock ] -- | Any block in the order of definition of blockParsers -block :: PandocMonad m => ParserT Text ParserState m Blocks +block :: PandocMonad m => TextileParser m Blocks block = do res <- choice blockParsers <?> "block" trace (T.take 60 $ tshow $ B.toList res) return res -commentBlock :: PandocMonad m => ParserT Text ParserState m Blocks +commentBlock :: PandocMonad m => TextileParser m Blocks commentBlock = try $ do string "###." manyTill anyLine blanklines return mempty -codeBlock :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlock :: PandocMonad m => TextileParser m Blocks codeBlock = codeBlockTextile <|> codeBlockHtml -codeBlockTextile :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlockTextile :: PandocMonad m => TextileParser m Blocks codeBlockTextile = try $ do string "bc." <|> string "pre." extended <- option False (True <$ char '.') @@ -155,7 +160,7 @@ trimTrailingNewlines :: Text -> Text trimTrailingNewlines = T.dropWhileEnd (=='\n') -- | Code Blocks in Textile are between <pre> and </pre> -codeBlockHtml :: PandocMonad m => ParserT Text ParserState m Blocks +codeBlockHtml :: PandocMonad m => TextileParser m Blocks codeBlockHtml = try $ do (t@(TagOpen _ attrs),_) <- htmlTag (tagOpen (=="pre") (const True)) result' <- T.pack <$> manyTill anyChar (htmlTag (tagClose (=="pre"))) @@ -173,7 +178,7 @@ codeBlockHtml = try $ do return $ B.codeBlockWith (ident,classes,kvs) result''' -- | Header of the form "hN. content" with N in 1..6 -header :: PandocMonad m => ParserT Text ParserState m Blocks +header :: PandocMonad m => TextileParser m Blocks header = try $ do char 'h' level <- digitToInt <$> oneOf "123456" @@ -185,14 +190,14 @@ header = try $ do return $ B.headerWith attr' level name -- | Blockquote of the form "bq. content" -blockQuote :: PandocMonad m => ParserT Text ParserState m Blocks +blockQuote :: PandocMonad m => TextileParser m Blocks blockQuote = try $ do string "bq" >> attributes >> char '.' >> whitespace B.blockQuote <$> para -- Horizontal rule -hrule :: PandocMonad m => ParserT Text st m Blocks +hrule :: PandocMonad m => TextileParser m Blocks hrule = try $ do skipSpaces start <- oneOf "-*" @@ -207,39 +212,39 @@ hrule = try $ do -- | Can be a bullet list or an ordered list. 
This implementation is -- strict in the nesting, sublist must start at exactly "parent depth -- plus one" -anyList :: PandocMonad m => ParserT Text ParserState m Blocks +anyList :: PandocMonad m => TextileParser m Blocks anyList = try $ anyListAtDepth 1 <* blanklines -- | This allow one type of list to be nested into an other type, -- provided correct nesting -anyListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +anyListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks anyListAtDepth depth = choice [ bulletListAtDepth depth, orderedListAtDepth depth, definitionList ] -- | Bullet List of given depth, depth being the number of leading '*' -bulletListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +bulletListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks bulletListAtDepth depth = try $ B.bulletList <$> many1 (bulletListItemAtDepth depth) -- | Bullet List Item of given depth, depth being the number of -- leading '*' -bulletListItemAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +bulletListItemAtDepth :: PandocMonad m => Int -> TextileParser m Blocks bulletListItemAtDepth = genericListItemAtDepth '*' -- | Ordered List of given depth, depth being the number of -- leading '#' -orderedListAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +orderedListAtDepth :: PandocMonad m => Int -> TextileParser m Blocks orderedListAtDepth depth = try $ do items <- many1 (orderedListItemAtDepth depth) return $ B.orderedList items -- | Ordered List Item of given depth, depth being the number of -- leading '#' -orderedListItemAtDepth :: PandocMonad m => Int -> ParserT Text ParserState m Blocks +orderedListItemAtDepth :: PandocMonad m => Int -> TextileParser m Blocks orderedListItemAtDepth = genericListItemAtDepth '#' -- | Common implementation of list items -genericListItemAtDepth :: PandocMonad m => Char -> Int -> ParserT Text ParserState m Blocks +genericListItemAtDepth :: PandocMonad m => Char -> Int -> TextileParser m Blocks genericListItemAtDepth c depth = try $ do count depth (char c) >> attributes >> whitespace contents <- mconcat <$> many ((B.plain . mconcat <$> many1 inline) <|> @@ -249,25 +254,25 @@ genericListItemAtDepth c depth = try $ do return $ contents <> sublist -- | A definition list is a set of consecutive definition items -definitionList :: PandocMonad m => ParserT Text ParserState m Blocks +definitionList :: PandocMonad m => TextileParser m Blocks definitionList = try $ B.definitionList <$> many1 definitionListItem -- | List start character. -listStart :: PandocMonad m => ParserT Text ParserState m () +listStart :: PandocMonad m => TextileParser m () listStart = genericListStart '*' <|> () <$ genericListStart '#' <|> () <$ definitionListStart -genericListStart :: PandocMonad m => Char -> ParserT Text st m () +genericListStart :: PandocMonad m => Char -> TextileParser m () genericListStart c = () <$ try (many1 (char c) >> whitespace) -basicDLStart :: PandocMonad m => ParserT Text ParserState m () +basicDLStart :: PandocMonad m => TextileParser m () basicDLStart = do char '-' whitespace notFollowedBy newline -definitionListStart :: PandocMonad m => ParserT Text ParserState m Inlines +definitionListStart :: PandocMonad m => TextileParser m Inlines definitionListStart = try $ do basicDLStart trimInlines . mconcat <$> @@ -280,15 +285,15 @@ definitionListStart = try $ do -- the term defined, then spaces and ":=". 
The definition follows, on -- the same single line, or spaned on multiple line, after a line -- break. -definitionListItem :: PandocMonad m => ParserT Text ParserState m (Inlines, [Blocks]) +definitionListItem :: PandocMonad m => TextileParser m (Inlines, [Blocks]) definitionListItem = try $ do term <- mconcat . intersperse B.linebreak <$> many1 definitionListStart def' <- string ":=" *> optional whitespace *> (multilineDef <|> inlineDef) return (term, def') - where inlineDef :: PandocMonad m => ParserT Text ParserState m [Blocks] + where inlineDef :: PandocMonad m => TextileParser m [Blocks] inlineDef = liftM (\d -> [B.plain d]) $ optional whitespace >> (trimInlines . mconcat <$> many inline) <* newline - multilineDef :: PandocMonad m => ParserT Text ParserState m [Blocks] + multilineDef :: PandocMonad m => TextileParser m [Blocks] multilineDef = try $ do optional whitespace >> newline s <- T.pack <$> many1Till anyChar (try (string "=:" >> newline)) @@ -299,7 +304,7 @@ definitionListItem = try $ do -- raw content -- | A raw Html Block, optionally followed by blanklines -rawHtmlBlock :: PandocMonad m => ParserT Text ParserState m Blocks +rawHtmlBlock :: PandocMonad m => TextileParser m Blocks rawHtmlBlock = try $ do skipMany spaceChar (_,b) <- htmlTag isBlockTag @@ -307,14 +312,14 @@ rawHtmlBlock = try $ do return $ B.rawBlock "html" b -- | Raw block of LaTeX content -rawLaTeXBlock' :: PandocMonad m => ParserT Text ParserState m Blocks +rawLaTeXBlock' :: PandocMonad m => TextileParser m Blocks rawLaTeXBlock' = do guardEnabled Ext_raw_tex B.rawBlock "latex" <$> (rawLaTeXBlock <* spaces) -- | In textile, paragraphs are separated by blank lines. -para :: PandocMonad m => ParserT Text ParserState m Blocks +para :: PandocMonad m => TextileParser m Blocks para = B.para . trimInlines . mconcat <$> many1 inline -- Tables @@ -325,7 +330,7 @@ toAlignment '>' = AlignRight toAlignment '=' = AlignCenter toAlignment _ = AlignDefault -cellAttributes :: PandocMonad m => ParserT Text ParserState m (Bool, Alignment) +cellAttributes :: PandocMonad m => TextileParser m (Bool, Alignment) cellAttributes = try $ do isHeader <- option False (True <$ char '_') -- we just ignore colspan and rowspan markers: @@ -338,7 +343,7 @@ cellAttributes = try $ do return (isHeader, alignment) -- | A table cell spans until a pipe | -tableCell :: PandocMonad m => ParserT Text ParserState m ((Bool, Alignment), Blocks) +tableCell :: PandocMonad m => TextileParser m ((Bool, Alignment), Blocks) tableCell = try $ do char '|' (isHeader, alignment) <- option (False, AlignDefault) cellAttributes @@ -349,7 +354,7 @@ tableCell = try $ do return ((isHeader, alignment), B.plain content) -- | A table row is made of many table cells -tableRow :: PandocMonad m => ParserT Text ParserState m [((Bool, Alignment), Blocks)] +tableRow :: PandocMonad m => TextileParser m [((Bool, Alignment), Blocks)] tableRow = try $ do -- skip optional row attributes optional $ try $ do @@ -359,7 +364,7 @@ tableRow = try $ do many1 tableCell <* char '|' <* blankline -- | A table with an optional header. -table :: PandocMonad m => ParserT Text ParserState m Blocks +table :: PandocMonad m => TextileParser m Blocks table = try $ do -- ignore table attributes caption <- option mempty $ try $ do @@ -375,8 +380,9 @@ table = try $ do (toprow:rest) | any (fst . fst) toprow -> (toprow, rest) _ -> (mempty, rawrows) - let nbOfCols = maximum $ map length (headers:rows) - let aligns = map minimum $ transpose $ map (map (snd . 
fst)) (headers:rows) + let nbOfCols = maximum $ fmap length (headers :| rows) + let aligns = map (maybe AlignDefault minimum . nonEmpty) $ + transpose $ map (map (snd . fst)) (headers:rows) let toRow = Row nullAttr . map B.simpleCell toHeaderRow l = [toRow l | not (null l)] return $ B.table (B.simpleCaption $ B.plain caption) @@ -386,7 +392,7 @@ table = try $ do (TableFoot nullAttr []) -- | Ignore markers for cols, thead, tfoot. -ignorableRow :: PandocMonad m => ParserT Text ParserState m () +ignorableRow :: PandocMonad m => TextileParser m () ignorableRow = try $ do char '|' oneOf ":^-~" @@ -395,7 +401,7 @@ ignorableRow = try $ do _ <- anyLine return () -explicitBlockStart :: PandocMonad m => Text -> ParserT Text ParserState m () +explicitBlockStart :: PandocMonad m => Text -> TextileParser m () explicitBlockStart name = try $ do string (T.unpack name) attributes @@ -407,8 +413,8 @@ explicitBlockStart name = try $ do -- However, they can be used to set HTML/CSS attributes when needed. maybeExplicitBlock :: PandocMonad m => Text -- ^ block tag name - -> ParserT Text ParserState m Blocks -- ^ implicit block - -> ParserT Text ParserState m Blocks + -> TextileParser m Blocks -- ^ implicit block + -> TextileParser m Blocks maybeExplicitBlock name blk = try $ do optional $ explicitBlockStart name blk @@ -421,11 +427,11 @@ maybeExplicitBlock name blk = try $ do -- | Any inline element -inline :: PandocMonad m => ParserT Text ParserState m Inlines +inline :: PandocMonad m => TextileParser m Inlines inline = choice inlineParsers <?> "inline" -- | Inline parsers tried in order -inlineParsers :: PandocMonad m => [ParserT Text ParserState m Inlines] +inlineParsers :: PandocMonad m => [TextileParser m Inlines] inlineParsers = [ str , whitespace , endline @@ -445,7 +451,7 @@ inlineParsers = [ str ] -- | Inline markups -inlineMarkup :: PandocMonad m => ParserT Text ParserState m Inlines +inlineMarkup :: PandocMonad m => TextileParser m Inlines inlineMarkup = choice [ simpleInline (string "??") (B.cite []) , simpleInline (string "**") B.strong , simpleInline (string "__") B.emph @@ -459,29 +465,29 @@ inlineMarkup = choice [ simpleInline (string "??") (B.cite []) ] -- | Trademark, registered, copyright -mark :: PandocMonad m => ParserT Text st m Inlines +mark :: PandocMonad m => TextileParser m Inlines mark = try $ char '(' >> (try tm <|> try reg <|> copy) -reg :: PandocMonad m => ParserT Text st m Inlines +reg :: PandocMonad m => TextileParser m Inlines reg = do oneOf "Rr" char ')' return $ B.str "\174" -tm :: PandocMonad m => ParserT Text st m Inlines +tm :: PandocMonad m => TextileParser m Inlines tm = do oneOf "Tt" oneOf "Mm" char ')' return $ B.str "\8482" -copy :: PandocMonad m => ParserT Text st m Inlines +copy :: PandocMonad m => TextileParser m Inlines copy = do oneOf "Cc" char ')' return $ B.str "\169" -note :: PandocMonad m => ParserT Text ParserState m Inlines +note :: PandocMonad m => TextileParser m Inlines note = try $ do ref <- char '[' *> many1 digit <* char ']' notes <- stateNotes <$> getState @@ -505,13 +511,13 @@ wordBoundaries :: [Char] wordBoundaries = markupChars <> stringBreakers -- | Parse a hyphened sequence of words -hyphenedWords :: PandocMonad m => ParserT Text ParserState m Text +hyphenedWords :: PandocMonad m => TextileParser m Text hyphenedWords = do x <- wordChunk xs <- many (try $ char '-' >> wordChunk) return $ T.intercalate "-" (x:xs) -wordChunk :: PandocMonad m => ParserT Text ParserState m Text +wordChunk :: PandocMonad m => TextileParser m Text wordChunk = try $ do 
hd <- noneOf wordBoundaries tl <- many ( noneOf wordBoundaries <|> @@ -520,7 +526,7 @@ wordChunk = try $ do return $ T.pack $ hd:tl -- | Any string -str :: PandocMonad m => ParserT Text ParserState m Inlines +str :: PandocMonad m => TextileParser m Inlines str = do baseStr <- hyphenedWords -- RedCloth compliance : if parsed word is uppercase and immediately @@ -533,11 +539,11 @@ str = do return $ B.str fullStr -- | Some number of space chars -whitespace :: PandocMonad m => ParserT Text st m Inlines +whitespace :: PandocMonad m => TextileParser m Inlines whitespace = many1 spaceChar >> return B.space <?> "whitespace" -- | In Textile, an isolated endline character is a line break -endline :: PandocMonad m => ParserT Text ParserState m Inlines +endline :: PandocMonad m => TextileParser m Inlines endline = try $ do newline notFollowedBy blankline @@ -545,18 +551,18 @@ endline = try $ do notFollowedBy rawHtmlBlock return B.linebreak -rawHtmlInline :: PandocMonad m => ParserT Text ParserState m Inlines +rawHtmlInline :: PandocMonad m => TextileParser m Inlines rawHtmlInline = B.rawInline "html" . snd <$> htmlTag isInlineTag -- | Raw LaTeX Inline -rawLaTeXInline' :: PandocMonad m => ParserT Text ParserState m Inlines +rawLaTeXInline' :: PandocMonad m => TextileParser m Inlines rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex B.rawInline "latex" <$> rawLaTeXInline -- | Textile standard link syntax is "label":target. But we -- can also have ["label":target]. -link :: PandocMonad m => ParserT Text ParserState m Inlines +link :: PandocMonad m => TextileParser m Inlines link = try $ do bracketed <- (True <$ char '[') <|> return False char '"' *> notFollowedBy (oneOf " \t\n\r") @@ -576,7 +582,7 @@ link = try $ do else B.spanWith attr $ B.link url "" name' -- | image embedding -image :: PandocMonad m => ParserT Text ParserState m Inlines +image :: PandocMonad m => TextileParser m Inlines image = try $ do char '!' >> notFollowedBy space (ident, cls, kvs) <- attributes @@ -588,51 +594,51 @@ image = try $ do char '!' return $ B.imageWith attr src alt (B.str alt) -escapedInline :: PandocMonad m => ParserT Text ParserState m Inlines +escapedInline :: PandocMonad m => TextileParser m Inlines escapedInline = escapedEqs <|> escapedTag -escapedEqs :: PandocMonad m => ParserT Text ParserState m Inlines +escapedEqs :: PandocMonad m => TextileParser m Inlines escapedEqs = B.str . T.pack <$> try (string "==" *> manyTill anyChar' (try $ string "==")) -- | literal text escaped btw <notextile> tags -escapedTag :: PandocMonad m => ParserT Text ParserState m Inlines +escapedTag :: PandocMonad m => TextileParser m Inlines escapedTag = B.str . T.pack <$> try (string "<notextile>" *> manyTill anyChar' (try $ string "</notextile>")) -- | Any special symbol defined in wordBoundaries -symbol :: PandocMonad m => ParserT Text ParserState m Inlines +symbol :: PandocMonad m => TextileParser m Inlines symbol = B.str . T.singleton <$> (notFollowedBy newline *> notFollowedBy rawHtmlBlock *> oneOf wordBoundaries) -- | Inline code -code :: PandocMonad m => ParserT Text ParserState m Inlines +code :: PandocMonad m => TextileParser m Inlines code = code1 <|> code2 -- any character except a newline before a blank line -anyChar' :: PandocMonad m => ParserT Text ParserState m Char +anyChar' :: PandocMonad m => TextileParser m Char anyChar' = satisfy (/='\n') <|> try (char '\n' <* notFollowedBy blankline) -code1 :: PandocMonad m => ParserT Text ParserState m Inlines +code1 :: PandocMonad m => TextileParser m Inlines code1 = B.code . 
T.pack <$> surrounded (char '@') anyChar' -code2 :: PandocMonad m => ParserT Text ParserState m Inlines +code2 :: PandocMonad m => TextileParser m Inlines code2 = do htmlTag (tagOpen (=="tt") null) B.code . T.pack <$> manyTill anyChar' (try $ htmlTag $ tagClose (=="tt")) -- | Html / CSS attributes -attributes :: PandocMonad m => ParserT Text ParserState m Attr -attributes = foldl (flip ($)) ("",[],[]) <$> +attributes :: PandocMonad m => TextileParser m Attr +attributes = foldl' (flip ($)) ("",[],[]) <$> try (do special <- option id specialAttribute attrs <- many attribute return (special : attrs)) -specialAttribute :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +specialAttribute :: PandocMonad m => TextileParser m (Attr -> Attr) specialAttribute = do alignStr <- ("center" <$ char '=') <|> ("justify" <$ try (string "<>")) <|> @@ -641,11 +647,11 @@ specialAttribute = do notFollowedBy spaceChar return $ addStyle $ T.pack $ "text-align:" ++ alignStr -attribute :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +attribute :: PandocMonad m => TextileParser m (Attr -> Attr) attribute = try $ (classIdAttr <|> styleAttr <|> langAttr) <* notFollowedBy spaceChar -classIdAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +classIdAttr :: PandocMonad m => TextileParser m (Attr -> Attr) classIdAttr = try $ do -- (class class #id) char '(' ws <- T.words `fmap` T.pack <$> manyTill anyChar' (char ')') @@ -657,7 +663,7 @@ classIdAttr = try $ do -- (class class #id) classes' -> return $ \(_,_,keyvals) -> ("",classes',keyvals) -styleAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +styleAttr :: PandocMonad m => TextileParser m (Attr -> Attr) styleAttr = do style <- try $ enclosed (char '{') (char '}') anyChar' return $ addStyle $ T.pack style @@ -668,23 +674,23 @@ addStyle style (id',classes,keyvals) = where keyvals' = ("style", style') : [(k,v) | (k,v) <- keyvals, k /= "style"] style' = style <> ";" <> T.concat [v | ("style",v) <- keyvals] -langAttr :: PandocMonad m => ParserT Text ParserState m (Attr -> Attr) +langAttr :: PandocMonad m => TextileParser m (Attr -> Attr) langAttr = do lang <- try $ enclosed (char '[') (char ']') alphaNum return $ \(id',classes,keyvals) -> (id',classes,("lang",T.pack lang):keyvals) -- | Parses material surrounded by a parser. 
surrounded :: (PandocMonad m, Show t) - => ParserT Text st m t -- ^ surrounding parser - -> ParserT Text st m a -- ^ content parser (to be used repeatedly) - -> ParserT Text st m [a] + => ParserT Sources st m t -- ^ surrounding parser + -> ParserT Sources st m a -- ^ content parser (to be used repeatedly) + -> ParserT Sources st m [a] surrounded border = enclosed (border *> notFollowedBy (oneOf " \t\n\r")) (try border) simpleInline :: PandocMonad m - => ParserT Text ParserState m t -- ^ surrounding parser + => TextileParser m t -- ^ surrounding parser -> (Inlines -> Inlines) -- ^ Inline constructor - -> ParserT Text ParserState m Inlines -- ^ content parser (to be used repeatedly) + -> TextileParser m Inlines -- ^ content parser (to be used repeatedly) simpleInline border construct = try $ do notAfterString border *> notFollowedBy (oneOf " \t\n\r") @@ -698,7 +704,7 @@ simpleInline border construct = try $ do then body else B.spanWith attr body -groupedInlineMarkup :: PandocMonad m => ParserT Text ParserState m Inlines +groupedInlineMarkup :: PandocMonad m => TextileParser m Inlines groupedInlineMarkup = try $ do char '[' sp1 <- option mempty $ B.space <$ whitespace @@ -707,5 +713,5 @@ groupedInlineMarkup = try $ do char ']' return $ sp1 <> result <> sp2 -eof' :: Monad m => ParserT Text s m Char +eof' :: Monad m => ParserT Sources s m Char eof' = '\n' <$ eof diff --git a/src/Text/Pandoc/Readers/TikiWiki.hs b/src/Text/Pandoc/Readers/TikiWiki.hs index fb4b662c5..5c414fdec 100644 --- a/src/Text/Pandoc/Readers/TikiWiki.hs +++ b/src/Text/Pandoc/Readers/TikiWiki.hs @@ -30,23 +30,23 @@ import Text.Pandoc.Definition import Text.Pandoc.Logging (Verbosity (..)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) -import Text.Pandoc.Shared (crFilter, safeRead) +import Text.Pandoc.Shared (safeRead) import Text.Pandoc.XML (fromEntities) import Text.Printf (printf) -- | Read TikiWiki from an input string and return a Pandoc document. 
-readTikiWiki :: PandocMonad m +readTikiWiki :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTikiWiki opts s = do - res <- readWithM parseTikiWiki def{ stateOptions = opts } - (crFilter s <> "\n\n") + let sources = ensureFinalNewlines 2 (toSources s) + res <- readWithM parseTikiWiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right d -> return d -type TikiWikiParser = ParserT Text ParserState +type TikiWikiParser = ParserT Sources ParserState -- -- utility functions diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs index 474e4fac0..b5cf5a0f3 100644 --- a/src/Text/Pandoc/Readers/Txt2Tags.hs +++ b/src/Text/Pandoc/Readers/Txt2Tags.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} {- | Module : Text.Pandoc.Readers.Txt2Tags Copyright : Copyright (C) 2014 Matthew Pickering @@ -19,6 +20,7 @@ import Control.Monad.Except (catchError, throwError) import Control.Monad.Reader (Reader, asks, runReader) import Data.Default import Data.List (intercalate, transpose) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T @@ -31,9 +33,9 @@ import Data.Time (defaultTimeLocale) import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (space, spaces, uri) -import Text.Pandoc.Shared (compactify, compactifyDL, crFilter, escapeURI) +import Text.Pandoc.Shared (compactify, compactifyDL, escapeURI) -type T2T = ParserT Text ParserState (Reader T2TMeta) +type T2T = ParserT Sources ParserState (Reader T2TMeta) -- | An object for the T2T macros meta information -- the contents of each field is simply substituted verbatim into the file @@ -53,25 +55,28 @@ getT2TMeta = do inps <- P.getInputFiles outp <- fromMaybe "" <$> P.getOutputFile curDate <- formatTime defaultTimeLocale "%F" <$> P.getZonedTime - let getModTime = fmap (formatTime defaultTimeLocale "%T") . - P.getModificationTime - curMtime <- case inps of - [] -> formatTime defaultTimeLocale "%T" <$> P.getZonedTime - _ -> catchError - (maximum <$> mapM getModTime inps) - (const (return "")) - return $ T2TMeta (T.pack curDate) (T.pack curMtime) (intercalate ", " inps) outp + curMtime <- catchError + (mapM P.getModificationTime inps >>= + (\case + Nothing -> + formatTime defaultTimeLocale "%T" <$> P.getZonedTime + Just ts -> return $ + formatTime defaultTimeLocale "%T" $ maximum ts) + . nonEmpty) + (const (return "")) + return $ T2TMeta (T.pack curDate) (T.pack curMtime) + (intercalate ", " inps) outp -- | Read Txt2Tags from an input string returning a Pandoc document -readTxt2Tags :: PandocMonad m +readTxt2Tags :: (PandocMonad m, ToSources a) => ReaderOptions - -> Text + -> a -> m Pandoc readTxt2Tags opts s = do + let sources = ensureFinalNewlines 2 (toSources s) meta <- getT2TMeta let parsed = flip runReader meta $ - readWithM parseT2T (def {stateOptions = opts}) $ - crFilter s <> "\n\n" + readWithM parseT2T (def {stateOptions = opts}) sources case parsed of Right result -> return result Left e -> throwError e @@ -261,9 +266,9 @@ table = try $ do rows <- many1 (many commentLine *> tableRow) let columns = transpose rows let ncolumns = length columns - let aligns = map (foldr1 findAlign . map fst) columns + let aligns = map (fromMaybe AlignDefault . 
foldr findAlign Nothing) columns let rows' = map (map snd) rows - let size = maximum (map length rows') + let size = maybe 0 maximum $ nonEmpty $ map length rows' let rowsPadded = map (pad size) rows' let headerPadded = if null tableHeader then mempty else pad size tableHeader let toRow = Row nullAttr . map B.simpleCell @@ -278,10 +283,11 @@ pad :: (Monoid a) => Int -> [a] -> [a] pad n xs = xs ++ replicate (n - length xs) mempty -findAlign :: Alignment -> Alignment -> Alignment -findAlign x y - | x == y = x - | otherwise = AlignDefault +findAlign :: (Alignment, a) -> Maybe Alignment -> Maybe Alignment +findAlign (x,_) (Just y) + | x == y = Just x + | otherwise = Just AlignDefault +findAlign (x,_) Nothing = Just x headerRow :: T2T [(Alignment, Blocks)] headerRow = genericRow (string "||") @@ -472,9 +478,29 @@ macro = try $ do -- raw URLs in text are automatically linked url :: T2T Inlines url = try $ do - (rawUrl, escapedUrl) <- try uri <|> emailAddress + (rawUrl, escapedUrl) <- try uri <|> emailAddress' return $ B.link rawUrl "" (B.str escapedUrl) +emailAddress' :: T2T (Text, Text) +emailAddress' = do + (base, mailURI) <- emailAddress + query <- option "" emailQuery + return (base <> query, mailURI <> query) + +emailQuery :: T2T Text +emailQuery = do + char '?' + parts <- kv `sepBy1` (char '&') + return $ "?" <> T.intercalate "&" parts + +kv :: T2T Text +kv = do + k <- T.pack <$> many1 alphaNum + char '=' + let vchar = alphaNum <|> try (oneOf "%._/~:,=$@&+-;*" <* lookAhead alphaNum) + v <- T.pack <$> many1 vchar + return (k <> "=" <> v) + uri :: T2T (Text, Text) uri = try $ do address <- t2tURI @@ -564,7 +590,7 @@ getTarget = do _ -> "html" atStart :: T2T () -atStart = (sourceColumn <$> getPosition) >>= guard . (== 1) +atStart = getPosition >>= guard . (== 1) . 
sourceColumn ignoreSpacesCap :: T2T Text -> T2T Text ignoreSpacesCap p = T.toLower <$> (spaces *> p <* spaces) diff --git a/src/Text/Pandoc/Readers/Vimwiki.hs b/src/Text/Pandoc/Readers/Vimwiki.hs index 74dac5ea7..460f304c4 100644 --- a/src/Text/Pandoc/Readers/Vimwiki.hs +++ b/src/Text/Pandoc/Readers/Vimwiki.hs @@ -74,23 +74,28 @@ import Text.Pandoc.Parsing (ParserState, ParserT, blanklines, emailAddress, many1Till, orderedListMarker, readWithM, registerHeader, spaceChar, stateMeta, stateOptions, uri, manyTillChar, manyChar, textStr, - many1Char, countChar, many1TillChar) -import Text.Pandoc.Shared (crFilter, splitTextBy, stringify, stripFirstAndLast, + many1Char, countChar, many1TillChar, + alphaNum, anyChar, char, newline, noneOf, oneOf, + space, spaces, string) +import Text.Pandoc.Sources (ToSources(..), Sources) +import Text.Pandoc.Shared (splitTextBy, stringify, stripFirstAndLast, isURI, tshow) -import Text.Parsec.Char (alphaNum, anyChar, char, newline, noneOf, oneOf, space, - spaces, string) import Text.Parsec.Combinator (between, choice, eof, lookAhead, many1, manyTill, notFollowedBy, option, skipMany1) import Text.Parsec.Prim (getState, many, try, updateState, (<|>)) -readVimwiki :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readVimwiki :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc readVimwiki opts s = do - res <- readWithM parseVimwiki def{ stateOptions = opts } $ crFilter s + let sources = toSources s + res <- readWithM parseVimwiki def{ stateOptions = opts } sources case res of Left e -> throwError e Right result -> return result -type VwParser = ParserT Text ParserState +type VwParser = ParserT Sources ParserState -- constants diff --git a/src/Text/Pandoc/RoffChar.hs b/src/Text/Pandoc/RoffChar.hs index 67e8b9cd5..d1c38204f 100644 --- a/src/Text/Pandoc/RoffChar.hs +++ b/src/Text/Pandoc/RoffChar.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.RoffChar - Copyright : Copyright (C) 2007-2020 John MacFarlane + Copyright : Copyright (C) 2007-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index 061361aba..3bbab4bbe 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -2,7 +2,7 @@ {-# LANGUAGE TupleSections #-} {- | Module : Text.Pandoc.SelfContained - Copyright : Copyright (C) 2011-2020 John MacFarlane + Copyright : Copyright (C) 2011-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -16,7 +16,6 @@ the HTML using data URIs. 
module Text.Pandoc.SelfContained ( makeDataURI, makeSelfContained ) where import Codec.Compression.GZip as Gzip import Control.Applicative ((<|>)) -import Control.Monad.Except (throwError) import Control.Monad.Trans (lift) import Data.ByteString (ByteString) import Data.ByteString.Base64 @@ -29,7 +28,6 @@ import System.FilePath (takeDirectory, takeExtension, (</>)) import Text.HTML.TagSoup import Text.Pandoc.Class.PandocMonad (PandocMonad (..), fetchItem, getInputFiles, report, setInputFiles) -import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Shared (isURI, renderTags', trim) @@ -244,11 +242,10 @@ getData mimetype src let raw' = if ext `elem` [".gz", ".svgz"] then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks [raw] else raw - mime <- case (mimetype, respMime) of - ("",Nothing) -> throwError $ PandocSomeError - $ "Could not determine mime type for `" <> src <> "'" - (x, Nothing) -> return x - (_, Just x ) -> return x + let mime = case (mimetype, respMime) of + ("",Nothing) -> "application/octet-stream" + (x, Nothing) -> x + (_, Just x ) -> x result <- if "text/css" `T.isPrefixOf` mime then do oldInputs <- getInputFiles diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 4853621c8..920edca7b 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -8,7 +8,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Shared - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,18 +21,11 @@ module Text.Pandoc.Shared ( -- * List processing splitBy, splitTextBy, - splitByIndices, - splitStringByIndices, splitTextByIndices, - substitute, ordNub, findM, -- * Text processing - ToString (..), - ToText (..), tshow, - backslashEscapes, - escapeStringUsing, elemText, notElemText, stripTrailingNewlines, @@ -70,10 +63,10 @@ module Text.Pandoc.Shared ( isTightList, taskListItemFromAscii, taskListItemToAscii, + handleTaskListItem, addMetaField, makeMeta, eastAsianLineBreakFilter, - underlineSpan, htmlSpanLikeElements, splitSentences, filterIpynbOutput, @@ -98,7 +91,7 @@ module Text.Pandoc.Shared ( safeRead, safeStrRead, -- * User data directory - defaultUserDataDirs, + defaultUserDataDir, -- * Version pandocVersion ) where @@ -112,7 +105,7 @@ import qualified Data.Bifunctor as Bifunctor import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum, generalCategory, GeneralCategory(NonSpacingMark, SpacingCombiningMark, EnclosingMark, ConnectorPunctuation)) -import Data.List (find, intercalate, intersperse, stripPrefix, sortOn) +import Data.List (find, intercalate, intersperse, sortOn, foldl') import qualified Data.Map as M import Data.Maybe (mapMaybe, fromMaybe) import Data.Monoid (Any (..)) @@ -130,7 +123,7 @@ import Text.HTML.TagSoup (RenderOptions (..), Tag (..), renderOptions, import Text.Pandoc.Builder (Blocks, Inlines, ToMetaValue (..)) import qualified Text.Pandoc.Builder as B import Data.Time -import Text.Pandoc.Asciify (toAsciiChar) +import Text.Pandoc.Asciify (toAsciiText) import Text.Pandoc.Definition import Text.Pandoc.Extensions (Extensions, Extension(..), extensionEnabled) import Text.Pandoc.Generic (bottomUp) @@ -150,46 +143,31 @@ splitBy :: (a -> Bool) -> [a] -> [[a]] splitBy _ [] = [] splitBy isSep lst = let (first, rest) = break isSep lst - rest' = dropWhile isSep rest - in first:splitBy isSep rest' + in first:splitBy isSep 
(dropWhile isSep rest) +-- | Split text by groups of one or more separator. splitTextBy :: (Char -> Bool) -> T.Text -> [T.Text] splitTextBy isSep t | T.null t = [] | otherwise = let (first, rest) = T.break isSep t - rest' = T.dropWhile isSep rest - in first : splitTextBy isSep rest' - -splitByIndices :: [Int] -> [a] -> [[a]] -splitByIndices [] lst = [lst] -splitByIndices (x:xs) lst = first:splitByIndices (map (\y -> y - x) xs) rest - where (first, rest) = splitAt x lst - --- | Split string into chunks divided at specified indices. -splitStringByIndices :: [Int] -> [Char] -> [[Char]] -splitStringByIndices [] lst = [lst] -splitStringByIndices (x:xs) lst = - let (first, rest) = splitAt' x lst in - first : splitStringByIndices (map (\y -> y - x) xs) rest + in first : splitTextBy isSep (T.dropWhile isSep rest) splitTextByIndices :: [Int] -> T.Text -> [T.Text] -splitTextByIndices ns = fmap T.pack . splitStringByIndices ns . T.unpack +splitTextByIndices ns = splitTextByRelIndices (zipWith (-) ns (0:ns)) . T.unpack + where + splitTextByRelIndices [] cs = [T.pack cs] + splitTextByRelIndices (x:xs) cs = + let (first, rest) = splitAt' x cs + in T.pack first : splitTextByRelIndices xs rest +-- Note: don't replace this with T.splitAt, which is not sensitive +-- to character widths! splitAt' :: Int -> [Char] -> ([Char],[Char]) splitAt' _ [] = ([],[]) splitAt' n xs | n <= 0 = ([],xs) splitAt' n (x:xs) = (x:ys,zs) where (ys,zs) = splitAt' (n - charWidth x) xs --- | Replace each occurrence of one sublist in a list with another. -substitute :: (Eq a) => [a] -> [a] -> [a] -> [a] -substitute _ _ [] = [] -substitute [] _ xs = xs -substitute target replacement lst@(x:xs) = - case stripPrefix target lst of - Just lst' -> replacement ++ substitute target replacement lst' - Nothing -> x : substitute target replacement xs - ordNub :: (Ord a) => [a] -> [a] ordNub l = go Set.empty l where @@ -209,38 +187,9 @@ findM p = foldr go (pure Nothing) -- Text processing -- -class ToString a where - toString :: a -> String - -instance ToString String where - toString = id - -instance ToString T.Text where - toString = T.unpack - -class ToText a where - toText :: a -> T.Text - -instance ToText String where - toText = T.pack - -instance ToText T.Text where - toText = id - tshow :: Show a => a -> T.Text tshow = T.pack . show --- | Returns an association list of backslash escapes for the --- designated characters. -backslashEscapes :: [Char] -- ^ list of special characters to escape - -> [(Char, T.Text)] -backslashEscapes = map (\ch -> (ch, T.pack ['\\',ch])) - --- | Escape a string of characters, using an association list of --- characters and strings. -escapeStringUsing :: [(Char, T.Text)] -> T.Text -> T.Text -escapeStringUsing tbl = T.concatMap $ \c -> fromMaybe (T.singleton c) $ lookup c tbl - -- | @True@ exactly when the @Char@ appears in the @Text@. elemText :: Char -> T.Text -> Bool elemText c = T.any (== c) @@ -253,17 +202,24 @@ notElemText c = T.all (/= c) stripTrailingNewlines :: T.Text -> T.Text stripTrailingNewlines = T.dropWhileEnd (== '\n') +isWS :: Char -> Bool +isWS ' ' = True +isWS '\r' = True +isWS '\n' = True +isWS '\t' = True +isWS _ = False + -- | Remove leading and trailing space (including newlines) from string. trim :: T.Text -> T.Text -trim = T.dropAround (`elemText` " \r\n\t") +trim = T.dropAround isWS -- | Remove leading space (including newlines) from string. 
triml :: T.Text -> T.Text -triml = T.dropWhile (`elemText` " \r\n\t") +triml = T.dropWhile isWS -- | Remove trailing space (including newlines) from string. trimr :: T.Text -> T.Text -trimr = T.dropWhileEnd (`elemText` " \r\n\t") +trimr = T.dropWhileEnd isWS -- | Trim leading space and trailing space unless after \. trimMath :: T.Text -> T.Text @@ -274,7 +230,7 @@ trimMath = triml . T.reverse . stripBeginSpace . T.reverse -- no Text.spanEnd | Just ('\\', _) <- T.uncons suff = T.cons (T.last pref) suff | otherwise = suff where - (pref, suff) = T.span (`elemText` " \t\n\r") t + (pref, suff) = T.span isWS t -- | Strip leading and trailing characters from string stripFirstAndLast :: T.Text -> T.Text @@ -342,6 +298,7 @@ tabFilter tabStop = T.unlines . map go . T.lines (tabStop - (T.length s1 `mod` tabStop)) (T.pack " ") <> go (T.drop 1 s2) +{-# DEPRECATED crFilter "readers filter crs automatically" #-} -- | Strip out DOS line endings. crFilter :: T.Text -> T.Text crFilter = T.filter (/= '\r') @@ -483,22 +440,20 @@ plainToPara :: Block -> Block plainToPara (Plain ils) = Para ils plainToPara x = x + -- | Like @compactify@, but acts on items of definition lists. compactifyDL :: [(Inlines, [Blocks])] -> [(Inlines, [Blocks])] compactifyDL items = - let defs = concatMap snd items - in case reverse (concatMap B.toList defs) of - (Para x:xs) - | not (any isPara xs) -> - let (t,ds) = last items - lastDef = B.toList $ last ds - ds' = init ds ++ - if null lastDef - then [B.fromList lastDef] - else [B.fromList $ init lastDef ++ [Plain x]] - in init items ++ [(t, ds')] - | otherwise -> items - _ -> items + case reverse items of + ((t,ds):ys) -> + case reverse (map (reverse . B.toList) ds) of + ((Para x:xs) : zs) | not (any isPara xs) -> + reverse ys ++ + [(t, reverse (map B.fromList zs) ++ + [B.fromList (reverse (Plain x:xs))])] + _ -> items + _ -> items + -- | Combine a list of lines by adding hard linebreaks. combineLines :: [[Inline]] -> [Inline] @@ -532,7 +487,7 @@ inlineListToIdentifier exts = | otherwise = T.dropWhile (not . isAlpha) filterAscii | extensionEnabled Ext_ascii_identifiers exts - = T.pack . mapMaybe toAsciiChar . T.unpack + = toAsciiText | otherwise = id toIdent | extensionEnabled Ext_gfm_auto_identifiers exts = @@ -749,13 +704,6 @@ eastAsianLineBreakFilter = bottomUp go go xs = xs -{-# DEPRECATED underlineSpan "Use Text.Pandoc.Builder.underline instead" #-} --- | Builder for underline (deprecated). --- This probably belongs in Builder.hs in pandoc-types. --- Will be replaced once Underline is an element. -underlineSpan :: Inlines -> Inlines -underlineSpan = B.underline - -- | Set of HTML elements that are represented as Span with a class equal as -- the element tag itself. htmlSpanLikeElements :: Set.Set T.Text @@ -868,7 +816,7 @@ mapLeft = Bifunctor.first -- > collapseFilePath "parent/foo/.." == "parent" -- > collapseFilePath "/parent/foo/../../bar" == "/bar" collapseFilePath :: FilePath -> FilePath -collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories +collapseFilePath = Posix.joinPath . reverse . foldl' go [] . splitDirectories where go rs "." = rs go r@(p:rs) ".." = case p of @@ -905,7 +853,6 @@ filteredFilesFromArchive zf f = fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString) fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e) - -- -- IANA URIs -- @@ -1038,12 +985,16 @@ safeStrRead s = case reads s of -- -- | Return appropriate user data directory for platform. 
We use --- XDG_DATA_HOME (or its default value), but fall back to the --- legacy user data directory ($HOME/.pandoc on *nix) if this is --- missing. -defaultUserDataDirs :: IO [FilePath] -defaultUserDataDirs = E.catch (do +-- XDG_DATA_HOME (or its default value), but for backwards compatibility, +-- we fall back to the legacy user data directory ($HOME/.pandoc on *nix) +-- if the XDG_DATA_HOME is missing and this exists. If neither directory +-- is present, we return the XDG data directory. +defaultUserDataDir :: IO FilePath +defaultUserDataDir = do xdgDir <- getXdgDirectory XdgData "pandoc" legacyDir <- getAppUserDataDirectory "pandoc" - return $ ordNub [xdgDir, legacyDir]) - (\(_ :: E.SomeException) -> return []) + xdgExists <- doesDirectoryExist xdgDir + legacyDirExists <- doesDirectoryExist legacyDir + if not xdgExists && legacyDirExists + then return legacyDir + else return xdgDir diff --git a/src/Text/Pandoc/Slides.hs b/src/Text/Pandoc/Slides.hs index 9ea0389c9..a3e550b1f 100644 --- a/src/Text/Pandoc/Slides.hs +++ b/src/Text/Pandoc/Slides.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Slides - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Sources.hs b/src/Text/Pandoc/Sources.hs new file mode 100644 index 000000000..5511ccfb8 --- /dev/null +++ b/src/Text/Pandoc/Sources.hs @@ -0,0 +1,195 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Sources + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Defines Sources object to be used as input to pandoc parsers and redefines Char +parsers so they get source position information from it. +-} + +module Text.Pandoc.Sources + ( Sources(..) + , ToSources(..) + , UpdateSourcePos(..) + , sourcesToText + , initialSourceName + , addToSources + , ensureFinalNewlines + , addToInput + , satisfy + , oneOf + , noneOf + , anyChar + , char + , string + , newline + , space + , spaces + , letter + , digit + , hexDigit + , alphaNum + ) +where +import qualified Text.Parsec as P +import Text.Parsec (Stream(..), ParsecT) +import Text.Parsec.Pos as P +import Data.Text (Text) +import qualified Data.Text as T +import Data.Char (isSpace, isLetter, isAlphaNum, isDigit, isHexDigit) +import Data.String (IsString(..)) +import qualified Data.List.NonEmpty as NonEmpty + +-- | A list of inputs labeled with source positions. It is assumed +-- that the 'Text's have @\n@ line endings. +newtype Sources = Sources { unSources :: [(SourcePos, Text)] } + deriving (Show, Semigroup, Monoid) + +instance Monad m => Stream Sources m Char where + uncons (Sources []) = return Nothing + uncons (Sources ((pos,t):rest)) = + case T.uncons t of + Nothing -> uncons (Sources rest) + Just (c,t') -> return $ Just (c, Sources ((pos,t'):rest)) + +instance IsString Sources where + fromString s = Sources [(P.initialPos "", T.pack (filter (/='\r') s))] + +class ToSources a where + toSources :: a -> Sources + +instance ToSources Text where + toSources t = Sources [(P.initialPos "", T.filter (/='\r') t)] + +instance ToSources [(FilePath, Text)] where + toSources = Sources + . 
map (\(fp,t) -> + (P.initialPos fp, T.snoc (T.filter (/='\r') t) '\n')) + +instance ToSources Sources where + toSources = id + +sourcesToText :: Sources -> Text +sourcesToText (Sources xs) = mconcat $ map snd xs + +addToSources :: Monad m => SourcePos -> Text -> ParsecT Sources u m () +addToSources pos t = do + curpos <- P.getPosition + Sources xs <- P.getInput + let xs' = case xs of + [] -> [] + ((_,t'):rest) -> (curpos,t'):rest + P.setInput $ Sources ((pos, T.filter (/='\r') t):xs') + +ensureFinalNewlines :: Int -- ^ number of trailing newlines + -> Sources + -> Sources +ensureFinalNewlines n (Sources xs) = + case NonEmpty.nonEmpty xs of + Nothing -> Sources [(initialPos "", T.replicate n "\n")] + Just lst -> + case NonEmpty.last lst of + (spos, t) -> + case T.length (T.takeWhileEnd (=='\n') t) of + len | len >= n -> Sources xs + | otherwise -> Sources (NonEmpty.init lst ++ + [(spos, + t <> T.replicate (n - len) "\n")]) + +class UpdateSourcePos s c where + updateSourcePos :: SourcePos -> c -> s -> SourcePos + +instance UpdateSourcePos Text Char where + updateSourcePos pos c _ = updatePosChar pos c + +instance UpdateSourcePos Sources Char where + updateSourcePos pos c sources = + case sources of + Sources [] -> updatePosChar pos c + Sources ((_,t):(pos',_):_) + | T.null t -> pos' + Sources _ -> + case c of + '\n' -> incSourceLine (setSourceColumn pos 1) 1 + '\t' -> incSourceColumn pos (4 - ((sourceColumn pos - 1) `mod` 4)) + _ -> incSourceColumn pos 1 + +-- | Get name of first source in 'Sources'. +initialSourceName :: Sources -> FilePath +initialSourceName (Sources []) = "" +initialSourceName (Sources ((pos,_):_)) = sourceName pos + +-- | Add some text to the beginning of the input sources. +-- This simplifies code that expands macros. +addToInput :: Monad m => Text -> ParsecT Sources u m () +addToInput t = do + Sources xs <- P.getInput + case xs of + [] -> P.setInput $ Sources [(initialPos "",t)] + (pos,t'):rest -> P.setInput $ Sources ((pos, t <> t'):rest) + +-- We need to redefine the parsers in Text.Parsec.Char so that they +-- update source positions properly from the Sources stream. 
+ +satisfy :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => (Char -> Bool) -> ParsecT s u m Char +satisfy f = P.tokenPrim show updateSourcePos matcher + where + matcher c = if f c then Just c else Nothing + +oneOf :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => [Char] -> ParsecT s u m Char +oneOf cs = satisfy (`elem` cs) + +noneOf :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => [Char] -> ParsecT s u m Char +noneOf cs = satisfy (`notElem` cs) + +anyChar :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +anyChar = satisfy (const True) + +char :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => Char -> ParsecT s u m Char +char c = satisfy (== c) + +string :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => [Char] -> ParsecT s u m [Char] +string = mapM char + +newline :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +newline = satisfy (== '\n') + +space :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +space = satisfy isSpace + +spaces :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m () +spaces = P.skipMany space P.<?> "white space" + +letter :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +letter = satisfy isLetter + +alphaNum :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +alphaNum = satisfy isAlphaNum + +digit :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +digit = satisfy isDigit + +hexDigit :: (Monad m, Stream s m Char, UpdateSourcePos s Char) + => ParsecT s u m Char +hexDigit = satisfy isHexDigit diff --git a/src/Text/Pandoc/Templates.hs b/src/Text/Pandoc/Templates.hs index 0c10b258d..7fd896641 100644 --- a/src/Text/Pandoc/Templates.hs +++ b/src/Text/Pandoc/Templates.hs @@ -4,7 +4,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Templates - Copyright : Copyright (C) 2009-2020 John MacFarlane + Copyright : Copyright (C) 2009-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,6 +34,7 @@ import Control.Monad.Except (catchError, throwError) import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Error +import System.IO.Error (isDoesNotExistError) -- | Wrap a Monad in this if you want partials to -- be taken only from the default data files. @@ -70,6 +71,9 @@ getTemplate tp = UTF8.toText <$> PandocResourceNotFound _ -> -- see #5987 on reason for takeFileName readDataFile ("templates" </> takeFileName tp) + PandocIOError _ ioe | isDoesNotExistError ioe -> + -- see #5987 on reason for takeFileName + readDataFile ("templates" </> takeFileName tp) _ -> throwError e)) -- | Get default template for the specified writer. 
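The new Text.Pandoc.Sources module above is the core of this patch: readers now accept any input convertible via ToSources rather than a single Text, so multiple input files keep their own source positions and carriage returns are filtered in one place. Below is a minimal usage sketch, restricted to functions visible in this patch (toSources, ensureFinalNewlines, initialSourceName, sourcesToText); the file names and contents are invented for illustration and are not part of the patch itself.

import Text.Pandoc.Sources
import qualified Data.Text as T

main :: IO ()
main = do
  -- Two labeled inputs; each chunk keeps its own file name for source
  -- positions, and carriage returns are stripped during conversion.
  let inputs = [ ("a.md", T.pack "hello\r\nworld")
               , ("b.md", T.pack "goodbye") ]
      srcs   = toSources inputs
  -- Make sure the combined input ends with blank lines, replacing the
  -- readers' old pattern of appending "\n\n" to a single Text.
  let srcs' = ensureFinalNewlines 2 srcs
  putStrLn ("first source: " ++ initialSourceName srcs')  -- "a.md"
  putStrLn (T.unpack (sourcesToText srcs'))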
diff --git a/src/Text/Pandoc/Translations.hs b/src/Text/Pandoc/Translations.hs index 200d756f6..0c7d7ab23 100644 --- a/src/Text/Pandoc/Translations.hs +++ b/src/Text/Pandoc/Translations.hs @@ -4,7 +4,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Translations - Copyright : Copyright (C) 2017-2020 John MacFarlane + Copyright : Copyright (C) 2017-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs index 4621e1765..4d5921faf 100644 --- a/src/Text/Pandoc/UTF8.hs +++ b/src/Text/Pandoc/UTF8.hs @@ -1,8 +1,7 @@ -{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.UTF8 - Copyright : Copyright (C) 2010-2020 John MacFarlane + Copyright : Copyright (C) 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -40,67 +39,65 @@ where import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL +import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.IO as TIO import qualified Data.Text.Encoding as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TL import Prelude hiding (getContents, putStr, putStrLn, readFile, writeFile) import System.IO hiding (getContents, hGetContents, hPutStr, hPutStrLn, putStr, putStrLn, readFile, writeFile) -import qualified System.IO as IO -readFile :: FilePath -> IO String +readFile :: FilePath -> IO Text readFile f = do h <- openFile (encodePath f) ReadMode hGetContents h -getContents :: IO String +getContents :: IO Text getContents = hGetContents stdin -writeFileWith :: Newline -> FilePath -> String -> IO () +writeFileWith :: Newline -> FilePath -> Text -> IO () writeFileWith eol f s = withFile (encodePath f) WriteMode $ \h -> hPutStrWith eol h s -writeFile :: FilePath -> String -> IO () +writeFile :: FilePath -> Text -> IO () writeFile = writeFileWith nativeNewline -putStrWith :: Newline -> String -> IO () +putStrWith :: Newline -> Text -> IO () putStrWith eol s = hPutStrWith eol stdout s -putStr :: String -> IO () +putStr :: Text -> IO () putStr = putStrWith nativeNewline -putStrLnWith :: Newline -> String -> IO () +putStrLnWith :: Newline -> Text -> IO () putStrLnWith eol s = hPutStrLnWith eol stdout s -putStrLn :: String -> IO () +putStrLn :: Text -> IO () putStrLn = putStrLnWith nativeNewline -hPutStrWith :: Newline -> Handle -> String -> IO () +hPutStrWith :: Newline -> Handle -> Text -> IO () hPutStrWith eol h s = hSetNewlineMode h (NewlineMode eol eol) >> - hSetEncoding h utf8 >> IO.hPutStr h s + hSetEncoding h utf8 >> TIO.hPutStr h s -hPutStr :: Handle -> String -> IO () +hPutStr :: Handle -> Text -> IO () hPutStr = hPutStrWith nativeNewline -hPutStrLnWith :: Newline -> Handle -> String -> IO () +hPutStrLnWith :: Newline -> Handle -> Text -> IO () hPutStrLnWith eol h s = hSetNewlineMode h (NewlineMode eol eol) >> - hSetEncoding h utf8 >> IO.hPutStrLn h s + hSetEncoding h utf8 >> TIO.hPutStrLn h s -hPutStrLn :: Handle -> String -> IO () +hPutStrLn :: Handle -> Text -> IO () hPutStrLn = hPutStrLnWith nativeNewline -hGetContents :: Handle -> IO String -hGetContents = fmap toString . B.hGetContents --- hGetContents h = hSetEncoding h utf8_bom --- >> hSetNewlineMode h universalNewlineMode --- >> IO.hGetContents h +hGetContents :: Handle -> IO Text +hGetContents = fmap toText . 
B.hGetContents -- | Convert UTF8-encoded ByteString to Text, also -- removing '\r' characters. -toText :: B.ByteString -> T.Text +toText :: B.ByteString -> Text toText = T.decodeUtf8 . filterCRs . dropBOM where dropBOM bs = if "\xEF\xBB\xBF" `B.isPrefixOf` bs @@ -128,7 +125,7 @@ toTextLazy = TL.decodeUtf8 . filterCRs . dropBOM toStringLazy :: BL.ByteString -> String toStringLazy = TL.unpack . toTextLazy -fromText :: T.Text -> B.ByteString +fromText :: Text -> B.ByteString fromText = T.encodeUtf8 fromTextLazy :: TL.Text -> BL.ByteString diff --git a/src/Text/Pandoc/UUID.hs b/src/Text/Pandoc/UUID.hs index ca0df2d0b..12579be90 100644 --- a/src/Text/Pandoc/UUID.hs +++ b/src/Text/Pandoc/UUID.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.UUID - Copyright : Copyright (C) 2010-2020 John MacFarlane + Copyright : Copyright (C) 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers.hs b/src/Text/Pandoc/Writers.hs index 0654c2d85..c348477c2 100644 --- a/src/Text/Pandoc/Writers.hs +++ b/src/Text/Pandoc/Writers.hs @@ -4,7 +4,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -21,6 +21,8 @@ module Text.Pandoc.Writers , writeAsciiDoc , writeAsciiDoctor , writeBeamer + , writeBibTeX + , writeBibLaTeX , writeCommonMark , writeConTeXt , writeCustom @@ -79,12 +81,14 @@ import Data.Aeson import qualified Data.ByteString.Lazy as BL import Data.Text (Text) import qualified Data.Text as T +import Text.Pandoc.Shared (tshow) import Text.Pandoc.Class import Text.Pandoc.Definition import Text.Pandoc.Options import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Error import Text.Pandoc.Writers.AsciiDoc +import Text.Pandoc.Writers.BibTeX import Text.Pandoc.Writers.CommonMark import Text.Pandoc.Writers.ConTeXt import Text.Pandoc.Writers.CslJson @@ -119,7 +123,6 @@ import Text.Pandoc.Writers.Texinfo import Text.Pandoc.Writers.Textile import Text.Pandoc.Writers.XWiki import Text.Pandoc.Writers.ZimWiki -import Text.Parsec.Error data Writer m = TextWriter (WriterOptions -> Pandoc -> m Text) | ByteStringWriter (WriterOptions -> Pandoc -> m BL.ByteString) @@ -185,14 +188,16 @@ writers = [ ,("tei" , TextWriter writeTEI) ,("muse" , TextWriter writeMuse) ,("csljson" , TextWriter writeCslJson) + ,("bibtex" , TextWriter writeBibTeX) + ,("biblatex" , TextWriter writeBibLaTeX) ] -- | Retrieve writer, extensions based on formatSpec (format+extensions). 
getWriter :: PandocMonad m => Text -> m (Writer m, Extensions) getWriter s = case parseFormatSpec s of - Left e -> throwError $ PandocAppError - $ T.intercalate "\n" [T.pack m | Message m <- errorMessages e] + Left e -> throwError $ PandocAppError $ + "Error parsing writer format " <> tshow s <> ": " <> tshow e Right (writerName, extsToEnable, extsToDisable) -> case lookup writerName writers of Nothing -> throwError $ diff --git a/src/Text/Pandoc/Writers/AnnotatedTable.hs b/src/Text/Pandoc/Writers/AnnotatedTable.hs index 48c9d61f2..3f69496a9 100644 --- a/src/Text/Pandoc/Writers/AnnotatedTable.hs +++ b/src/Text/Pandoc/Writers/AnnotatedTable.hs @@ -1,8 +1,12 @@ {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE UndecidableInstances #-} {- | Module : Text.Pandoc.Writers.AnnotatedTable @@ -45,6 +49,7 @@ import Data.Generics ( Data import Data.List.NonEmpty ( NonEmpty(..) ) import GHC.Generics ( Generic ) import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Walk ( Walkable (..) ) -- | An annotated table type, corresponding to the Pandoc 'B.Table' -- constructor and the HTML @\<table\>@ element. It records the data @@ -298,3 +303,21 @@ fromBodyRow (BodyRow attr _ rh rb) = fromCell :: Cell -> B.Cell fromCell (Cell _ _ c) = c + +-- +-- Instances +-- +instance Walkable a B.Cell => Walkable a Cell where + walkM f (Cell colspecs colnum cell) = + Cell colspecs colnum <$> walkM f cell + query f (Cell _colspecs _colnum cell) = query f cell + +instance Walkable a B.Cell => Walkable a HeaderRow where + walkM f (HeaderRow attr rownum cells) = + HeaderRow attr rownum <$> walkM f cells + query f (HeaderRow _attr _rownum cells) = query f cells + +instance Walkable a B.Cell => Walkable a TableHead where + walkM f (TableHead attr rows) = + TableHead attr <$> walkM f rows + query f (TableHead _attr rows) = query f rows diff --git a/src/Text/Pandoc/Writers/AsciiDoc.hs b/src/Text/Pandoc/Writers/AsciiDoc.hs index e742577b6..ab7e5f1a9 100644 --- a/src/Text/Pandoc/Writers/AsciiDoc.hs +++ b/src/Text/Pandoc/Writers/AsciiDoc.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.AsciiDoc - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -22,6 +22,7 @@ module Text.Pandoc.Writers.AsciiDoc (writeAsciiDoc, writeAsciiDoctor) where import Control.Monad.State.Strict import Data.Char (isPunctuation, isSpace) import Data.List (intercalate, intersperse) +import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (fromMaybe, isJust) import qualified Data.Set as Set import qualified Data.Text as T @@ -37,6 +38,7 @@ import Text.Pandoc.Shared import Text.Pandoc.Templates (renderTemplate) import Text.Pandoc.Writers.Shared + data WriterState = WriterState { defListMarker :: Text , orderedListLevel :: Int , bulletListLevel :: Int @@ -45,6 +47,10 @@ data WriterState = WriterState { defListMarker :: Text , asciidoctorVariant :: Bool , inList :: Bool , hasMath :: Bool + -- |0 is no table + -- 1 is top level table + -- 2 is a table in a table + , tableNestingLevel :: Int } defaultWriterState :: WriterState @@ -56,6 +62,7 @@ defaultWriterState = WriterState { defListMarker = "::" , asciidoctorVariant = False , 
inList = False , hasMath = False + , tableNestingLevel = 0 } -- | Convert Pandoc to AsciiDoc. @@ -98,8 +105,11 @@ pandocToAsciiDoc opts (Pandoc meta blocks) = do -- | Escape special characters for AsciiDoc. escapeString :: Text -> Text -escapeString = escapeStringUsing escs - where escs = backslashEscapes "{" +escapeString t + | T.any (== '{') t = T.concatMap escChar t + | otherwise = t + where escChar '{' = "\\{" + escChar c = T.singleton c -- | Ordered list start parser for use in Para below. olMarker :: Parser Text ParserState Char @@ -194,7 +204,7 @@ blockToAsciiDoc opts (BlockQuote blocks) = do else contents let bar = text "____" return $ bar $$ chomp contents' $$ bar <> blankline -blockToAsciiDoc opts (Table _ blkCapt specs thead tbody tfoot) = do +blockToAsciiDoc opts block@(Table _ blkCapt specs thead tbody tfoot) = do let (caption, aligns, widths, headers, rows) = toLegacyTable blkCapt specs thead tbody tfoot caption' <- inlineListToAsciiDoc opts caption @@ -236,23 +246,42 @@ blockToAsciiDoc opts (Table _ blkCapt specs thead tbody tfoot) = do $ zipWith colspec aligns widths') <> text "," <> headerspec <> text "]" + + -- construct cells and recurse in case of nested tables + parentTableLevel <- gets tableNestingLevel + let currentNestingLevel = parentTableLevel + 1 + + modify $ \st -> st{ tableNestingLevel = currentNestingLevel } + + let separator = text (if parentTableLevel == 0 + then "|" -- top level separator + else "!") -- nested separator + let makeCell [Plain x] = do d <- blockListToAsciiDoc opts [Plain x] - return $ text "|" <> chomp d + return $ separator <> chomp d makeCell [Para x] = makeCell [Plain x] - makeCell [] = return $ text "|" - makeCell bs = do d <- blockListToAsciiDoc opts bs - return $ text "a|" $$ d + makeCell [] = return separator + makeCell bs = if currentNestingLevel == 2 + then do + --asciidoc only supports nesting once + report $ BlockNotRendered block + return separator + else do + d <- blockListToAsciiDoc opts bs + return $ (text "a" <> separator) $$ d + let makeRow cells = hsep `fmap` mapM makeCell cells rows' <- mapM makeRow rows head' <- makeRow headers + modify $ \st -> st{ tableNestingLevel = parentTableLevel } let head'' = if all null headers then empty else head' let colwidth = if writerWrapText opts == WrapAuto then writerColumns opts else 100000 - let maxwidth = maximum $ map offset (head':rows') + let maxwidth = maximum $ fmap offset (head' :| rows') let body = if maxwidth > colwidth then vsep rows' else vcat rows' - let border = text "|===" - return $ + let border = separator <> text "===" + return $ caption'' $$ tablespec $$ border $$ head'' $$ body $$ border $$ blankline blockToAsciiDoc opts (BulletList items) = do inlist <- gets inList @@ -470,7 +499,9 @@ inlineToAsciiDoc opts (Quoted qt lst) = do | otherwise -> [Str "``"] ++ lst ++ [Str "''"] inlineToAsciiDoc _ (Code _ str) = do isAsciidoctor <- gets asciidoctorVariant - let contents = literal (escapeStringUsing (backslashEscapes "`") str) + let escChar '`' = "\\'" + escChar c = T.singleton c + let contents = literal (T.concatMap escChar str) return $ if isAsciidoctor then text "`+" <> contents <> "+`" diff --git a/src/Text/Pandoc/Writers/BibTeX.hs b/src/Text/Pandoc/Writers/BibTeX.hs new file mode 100644 index 000000000..95de6b71f --- /dev/null +++ b/src/Text/Pandoc/Writers/BibTeX.hs @@ -0,0 +1,61 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.BibTeX + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : 
John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Writes a BibTeX or BibLaTeX bibliographies based on the +'references' metadata in a Pandoc document. +-} +module Text.Pandoc.Writers.BibTeX + ( writeBibTeX + , writeBibLaTeX + ) +where + +import Text.Pandoc.Options +import Text.Pandoc.Definition +import Data.Text (Text) +import Data.Maybe (mapMaybe) +import Citeproc (parseLang) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Citeproc.BibTeX as BibTeX +import Text.Pandoc.Citeproc.MetaValue (metaValueToReference) +import Text.Pandoc.Writers.Shared (lookupMetaString, defField, + addVariablesToContext) +import Text.DocLayout (render, vcat) +import Text.DocTemplates (Context(..)) +import Text.Pandoc.Templates (renderTemplate) + +-- | Write BibTeX based on the references metadata from a Pandoc document. +writeBibTeX :: PandocMonad m => WriterOptions -> Pandoc -> m Text +writeBibTeX = writeBibTeX' BibTeX.Bibtex + +-- | Write BibLaTeX based on the references metadata from a Pandoc document. +writeBibLaTeX :: PandocMonad m => WriterOptions -> Pandoc -> m Text +writeBibLaTeX = writeBibTeX' BibTeX.Biblatex + +writeBibTeX' :: PandocMonad m => Variant -> WriterOptions -> Pandoc -> m Text +writeBibTeX' variant opts (Pandoc meta _) = do + let mblang = case lookupMetaString "lang" meta of + "" -> Nothing + t -> either (const Nothing) Just $ parseLang t + let refs = case lookupMeta "references" meta of + Just (MetaList xs) -> mapMaybe metaValueToReference xs + _ -> [] + let main = vcat $ map (BibTeX.writeBibtexString opts variant mblang) refs + let context = defField "body" main + $ addVariablesToContext opts (mempty :: Context Text) + let colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + return $ render colwidth $ + case writerTemplate opts of + Nothing -> main + Just tpl -> renderTemplate tpl context + + diff --git a/src/Text/Pandoc/Writers/CommonMark.hs b/src/Text/Pandoc/Writers/CommonMark.hs index 66ded218f..8733b7149 100644 --- a/src/Text/Pandoc/Writers/CommonMark.hs +++ b/src/Text/Pandoc/Writers/CommonMark.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Writers.CommonMark - Copyright : Copyright (C) 2015-2020 John MacFarlane + Copyright : Copyright (C) 2015-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs index 0a6313513..3cafcefba 100644 --- a/src/Text/Pandoc/Writers/ConTeXt.hs +++ b/src/Text/Pandoc/Writers/ConTeXt.hs @@ -3,7 +3,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.ConTeXt - Copyright : Copyright (C) 2007-2020 John MacFarlane + Copyright : Copyright (C) 2007-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -20,7 +20,7 @@ import Data.Maybe (mapMaybe) import Data.Text (Text) import qualified Data.Text as T import Network.URI (unEscapeString) -import Text.Pandoc.BCP47 +import Text.Collate.Lang (Lang(..)) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) import Text.Pandoc.Definition import Text.Pandoc.ImageSize @@ -177,8 +177,12 @@ blockToConTeXt (Para lst) = do contents <- inlineListToConTeXt lst return $ contents <> blankline blockToConTeXt (LineBlock lns) = do - doclines <- nowrap . 
vcat <$> mapM inlineListToConTeXt lns - return $ "\\startlines" $$ doclines $$ "\\stoplines" <> blankline + let emptyToBlankline doc = if isEmpty doc + then blankline + else doc + doclines <- mapM inlineListToConTeXt lns + let contextLines = vcat . map emptyToBlankline $ doclines + return $ "\\startlines" $$ contextLines $$ "\\stoplines" <> blankline blockToConTeXt (BlockQuote lst) = do contents <- blockListToConTeXt lst return $ "\\startblockquote" $$ nest 0 contents $$ "\\stopblockquote" <> blankline @@ -228,13 +232,7 @@ blockToConTeXt (OrderedList (start, style', delim) lst) = do Period -> "stopper=." OneParen -> "stopper=)" TwoParens -> "left=(,stopper=)" - let width = maximum $ map T.length $ take (length contents) - (orderedListMarkers (start, style', delim)) - let width' = (toEnum width + 1) / 2 - let width'' = if width' > (1.5 :: Double) - then "width=" <> tshow width' <> "em" - else "" - let specs2Items = filter (not . T.null) [start', delim', width''] + let specs2Items = filter (not . T.null) [start', delim'] let specs2 = if null specs2Items then "" else "[" <> T.intercalate "," specs2Items <> "]" @@ -248,8 +246,8 @@ blockToConTeXt (OrderedList (start, style', delim) lst) = do UpperAlpha -> 'A') : if isTightList lst then ",packed]" else "]" let specs = T.pack style'' <> specs2 - return $ "\\startitemize" <> literal specs $$ vcat contents $$ - "\\stopitemize" <> blankline + return $ "\\startenumerate" <> literal specs $$ vcat contents $$ + "\\stopenumerate" <> blankline blockToConTeXt (DefinitionList lst) = liftM vcat $ mapM defListItemToConTeXt lst blockToConTeXt HorizontalRule = return $ "\\thinrule" <> blankline @@ -432,9 +430,13 @@ inlineToConTeXt (Link _ txt (src, _)) = do put $ st {stNextRef = next + 1} let ref = "url" <> tshow next contents <- inlineListToConTeXt txt + let escChar '#' = "\\#" + escChar '%' = "\\%" + escChar c = T.singleton c + let escContextURL = T.concatMap escChar return $ "\\useURL" <> brackets (literal ref) - <> brackets (literal $ escapeStringUsing [('#',"\\#"),('%',"\\%")] src) + <> brackets (literal $ escContextURL src) <> (if isAutolink then empty else brackets empty <> brackets contents) @@ -477,7 +479,7 @@ inlineToConTeXt (Note contents) = do then literal "\\footnote{" <> nest 2 (chomp contents') <> char '}' else literal "\\startbuffer " <> nest 2 (chomp contents') <> literal "\\stopbuffer\\footnote{\\getbuffer}" -inlineToConTeXt (Span (_,_,kvs) ils) = do +inlineToConTeXt (Span (ident,_,kvs) ils) = do mblang <- fromBCP47 (lookup "lang" kvs) let wrapDir txt = case lookup "dir" kvs of Just "rtl" -> braces $ "\\righttoleft " <> txt @@ -487,7 +489,11 @@ inlineToConTeXt (Span (_,_,kvs) ils) = do Just lng -> braces ("\\language" <> brackets (literal lng) <> txt) Nothing -> txt - wrapLang . wrapDir <$> inlineListToConTeXt ils + addReference = + if T.null ident + then id + else (("\\reference" <> brackets (literal ident) <> "{}") <>) + addReference . wrapLang . wrapDir <$> inlineListToConTeXt ils -- | Craft the section header, inserting the section reference, if supplied. 
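To see the switch above from \startitemize to \startenumerate for ordered lists, a minimal sketch (not part of this patch; the two-item list is an arbitrary placeholder):

{-# LANGUAGE OverloadedStrings #-}
import Text.Pandoc
import Text.Pandoc.Builder (doc, orderedList, plain)
import qualified Data.Text.IO as TIO

main :: IO ()
main = do
  result <- runIO $
    writeConTeXt def (doc (orderedList [plain "one", plain "two"]))
  -- after this change the rendered list is wrapped in
  -- \startenumerate ... \stopenumerate instead of \startitemize
  handleError result >>= TIO.putStrLn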
sectionHeader :: PandocMonad m @@ -549,26 +555,26 @@ fromBCP47 mbs = fromBCP47' <$> toLang mbs -- https://tools.ietf.org/html/bcp47#section-2.1 -- http://wiki.contextgarden.net/Language_Codes fromBCP47' :: Maybe Lang -> Maybe Text -fromBCP47' (Just (Lang "ar" _ "SY" _) ) = Just "ar-sy" -fromBCP47' (Just (Lang "ar" _ "IQ" _) ) = Just "ar-iq" -fromBCP47' (Just (Lang "ar" _ "JO" _) ) = Just "ar-jo" -fromBCP47' (Just (Lang "ar" _ "LB" _) ) = Just "ar-lb" -fromBCP47' (Just (Lang "ar" _ "DZ" _) ) = Just "ar-dz" -fromBCP47' (Just (Lang "ar" _ "MA" _) ) = Just "ar-ma" -fromBCP47' (Just (Lang "de" _ _ ["1901"]) ) = Just "deo" -fromBCP47' (Just (Lang "de" _ "DE" _) ) = Just "de-de" -fromBCP47' (Just (Lang "de" _ "AT" _) ) = Just "de-at" -fromBCP47' (Just (Lang "de" _ "CH" _) ) = Just "de-ch" -fromBCP47' (Just (Lang "el" _ _ ["poly"]) ) = Just "agr" -fromBCP47' (Just (Lang "en" _ "US" _) ) = Just "en-us" -fromBCP47' (Just (Lang "en" _ "GB" _) ) = Just "en-gb" -fromBCP47' (Just (Lang "grc"_ _ _) ) = Just "agr" -fromBCP47' (Just (Lang "el" _ _ _) ) = Just "gr" -fromBCP47' (Just (Lang "eu" _ _ _) ) = Just "ba" -fromBCP47' (Just (Lang "he" _ _ _) ) = Just "il" -fromBCP47' (Just (Lang "jp" _ _ _) ) = Just "ja" -fromBCP47' (Just (Lang "uk" _ _ _) ) = Just "ua" -fromBCP47' (Just (Lang "vi" _ _ _) ) = Just "vn" -fromBCP47' (Just (Lang "zh" _ _ _) ) = Just "cn" -fromBCP47' (Just (Lang l _ _ _) ) = Just l -fromBCP47' Nothing = Nothing +fromBCP47' (Just (Lang "ar" _ (Just "SY") _ _ _)) = Just "ar-sy" +fromBCP47' (Just (Lang "ar" _ (Just "IQ") _ _ _)) = Just "ar-iq" +fromBCP47' (Just (Lang "ar" _ (Just "JO") _ _ _)) = Just "ar-jo" +fromBCP47' (Just (Lang "ar" _ (Just "LB") _ _ _)) = Just "ar-lb" +fromBCP47' (Just (Lang "ar" _ (Just "DZ") _ _ _)) = Just "ar-dz" +fromBCP47' (Just (Lang "ar" _ (Just "MA") _ _ _)) = Just "ar-ma" +fromBCP47' (Just (Lang "de" _ _ ["1901"] _ _)) = Just "deo" +fromBCP47' (Just (Lang "de" _ (Just "DE") _ _ _)) = Just "de-de" +fromBCP47' (Just (Lang "de" _ (Just "AT") _ _ _)) = Just "de-at" +fromBCP47' (Just (Lang "de" _ (Just "CH") _ _ _)) = Just "de-ch" +fromBCP47' (Just (Lang "el" _ _ ["poly"] _ _)) = Just "agr" +fromBCP47' (Just (Lang "en" _ (Just "US") _ _ _)) = Just "en-us" +fromBCP47' (Just (Lang "en" _ (Just "GB") _ _ _)) = Just "en-gb" +fromBCP47' (Just (Lang "grc"_ _ _ _ _)) = Just "agr" +fromBCP47' (Just (Lang "el" _ _ _ _ _)) = Just "gr" +fromBCP47' (Just (Lang "eu" _ _ _ _ _)) = Just "ba" +fromBCP47' (Just (Lang "he" _ _ _ _ _)) = Just "il" +fromBCP47' (Just (Lang "jp" _ _ _ _ _)) = Just "ja" +fromBCP47' (Just (Lang "uk" _ _ _ _ _)) = Just "ua" +fromBCP47' (Just (Lang "vi" _ _ _ _ _)) = Just "vn" +fromBCP47' (Just (Lang "zh" _ _ _ _ _)) = Just "cn" +fromBCP47' (Just (Lang l _ _ _ _ _)) = Just l +fromBCP47' Nothing = Nothing diff --git a/src/Text/Pandoc/Writers/CslJson.hs b/src/Text/Pandoc/Writers/CslJson.hs index 08310de65..395335667 100644 --- a/src/Text/Pandoc/Writers/CslJson.hs +++ b/src/Text/Pandoc/Writers/CslJson.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.CslJson - Copyright : Copyright (C) 2020 John MacFarlane + Copyright : Copyright (C) 2020-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -34,23 +34,24 @@ import Control.Monad.Identity import Citeproc.Locale (getLocale) import Citeproc.CslJson import Text.Pandoc.Options (WriterOptions) -import Data.Maybe (mapMaybe) +import Data.Maybe (mapMaybe, fromMaybe) import Data.Aeson.Encode.Pretty (Config (..), Indent 
(Spaces), NumberFormat (Generic), defConfig, encodePretty') writeCslJson :: PandocMonad m => WriterOptions -> Pandoc -> m Text writeCslJson _opts (Pandoc meta _) = do - let lang = maybe (Lang "en" (Just "US")) parseLang - (lookupMeta "lang" meta >>= metaValueToText) + let lang = fromMaybe (Lang "en" Nothing (Just "US") [] [] []) + (lookupMeta "lang" meta >>= metaValueToText >>= + either (const Nothing) Just . parseLang) locale <- case getLocale lang of Left e -> throwError $ PandocCiteprocError e Right l -> return l - case lookupMeta "references" meta of - Just (MetaList rs) -> return $ (UTF8.toText $ - toCslJson locale (mapMaybe metaValueToReference rs)) - <> "\n" - _ -> throwError $ PandocAppError "No references field" + let rs = case lookupMeta "references" meta of + Just (MetaList xs) -> xs + _ -> [] + return $ UTF8.toText + (toCslJson locale (mapMaybe metaValueToReference rs)) <> "\n" fromInlines :: [Inline] -> CslJson Text fromInlines = foldMap fromInline . B.fromList diff --git a/src/Text/Pandoc/Writers/Custom.hs b/src/Text/Pandoc/Writers/Custom.hs index 8da611b61..58c4bb5be 100644 --- a/src/Text/Pandoc/Writers/Custom.hs +++ b/src/Text/Pandoc/Writers/Custom.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Custom - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 408d8cc0c..33a6f5f0c 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -3,7 +3,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.Docbook - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -15,6 +15,7 @@ Conversion of 'Pandoc' documents to Docbook XML. module Text.Pandoc.Writers.Docbook ( writeDocbook4, writeDocbook5 ) where import Control.Monad.Reader import Data.Generics (everywhere, mkT) +import Data.Maybe (isNothing) import Data.Monoid (Any (..)) import Data.Text (Text) import qualified Data.Text as T @@ -40,6 +41,25 @@ data DocBookVersion = DocBook4 | DocBook5 type DB = ReaderT DocBookVersion +-- | Get level of the top-level headers based on the configured top-level division. +-- The header level can then be used to determine appropriate DocBook element +-- for each subdivision associated with a header. +-- The numbering here follows LaTeX's internal numbering +getStartLvl :: WriterOptions -> Int +getStartLvl opts = + case writerTopLevelDivision opts of + TopLevelPart -> -1 + TopLevelChapter -> 0 + TopLevelSection -> 1 + TopLevelDefault -> 1 + +-- | Get correct name for the id attribute based on DocBook version. +-- DocBook 4 used custom id attribute but DocBook 5 adopted the xml:id specification. 
+-- https://www.w3.org/TR/xml-id/ +idName :: DocBookVersion -> Text +idName DocBook5 = "xml:id" +idName DocBook4 = "id" + -- | Convert list of authors to a docbook <author> section authorToDocbook :: PandocMonad m => WriterOptions -> [Inline] -> DB m B.Inlines authorToDocbook opts name' = do @@ -79,12 +99,7 @@ writeDocbook opts (Pandoc meta blocks) = do let colwidth = if writerWrapText opts == WrapAuto then Just $ writerColumns opts else Nothing - -- The numbering here follows LaTeX's internal numbering - let startLvl = case writerTopLevelDivision opts of - TopLevelPart -> -1 - TopLevelChapter -> 0 - TopLevelSection -> 1 - TopLevelDefault -> 1 + let startLvl = getStartLvl opts let fromBlocks = blocksToDocbook opts . makeSections False (Just startLvl) auths' <- mapM (authorToDocbook opts) $ docAuthors meta @@ -153,7 +168,7 @@ blockToDocbook :: PandocMonad m => WriterOptions -> Block -> DB m (Doc Text) blockToDocbook _ Null = return empty -- Add ids to paragraphs in divs with ids - this is needed for -- pandoc-citeproc to get link anchors in bibliographies: -blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do +blockToDocbook opts (Div (id',"section":_,_) (Header lvl (_,_,attrs) ils : xs)) = do version <- ask -- Docbook doesn't allow sections with no content, so insert some if needed let bs = if null xs @@ -166,28 +181,52 @@ blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do then "section" else "sect" <> tshow n _ -> "simplesect" - idName = if version == DocBook5 - then "xml:id" - else "id" - idAttr = [(idName, writerIdentifierPrefix opts <> id') | not (T.null id')] - nsAttr = if version == DocBook5 && lvl == 0 then [("xmlns", "http://docbook.org/ns/docbook"),("xmlns:xlink", "http://www.w3.org/1999/xlink")] - else [] - attribs = nsAttr <> idAttr + idAttr = [(idName version, writerIdentifierPrefix opts <> id') | not (T.null id')] + -- We want to add namespaces to the root (top-level) element. + nsAttr = if version == DocBook5 && lvl == getStartLvl opts && isNothing (writerTemplate opts) + -- Though, DocBook 4 does not support namespaces and + -- standalone documents will include them in the template. + then [("xmlns", "http://docbook.org/ns/docbook"),("xmlns:xlink", "http://www.w3.org/1999/xlink")] + else [] + + -- Populate miscAttr with Header.Attr.attributes, filtering out non-valid DocBook section attributes, id, and xml:id + miscAttr = filter (isSectionAttr version) attrs + attribs = nsAttr <> idAttr <> miscAttr title' <- inlinesToDocbook opts ils contents <- blocksToDocbook opts bs return $ inTags True tag attribs $ inTagsSimple "title" title' $$ contents -blockToDocbook opts (Div (ident,_,_) [Para lst]) = - let attribs = [("id", ident) | not (T.null ident)] in - if hasLineBreaks lst - then flush . nowrap . inTags False "literallayout" attribs - <$> inlinesToDocbook opts lst - else inTags True "para" attribs <$> inlinesToDocbook opts lst -blockToDocbook opts (Div (ident,_,_) bs) = do - contents <- blocksToDocbook opts (map plainToPara bs) - return $ - (if T.null ident - then mempty - else selfClosingTag "anchor" [("id", ident)]) $$ contents +blockToDocbook opts (Div (ident,classes,_) bs) = do + version <- ask + let identAttribs = [(idName version, ident) | not (T.null ident)] + admonitions = ["caution","danger","important","note","tip","warning"] + case classes of + (l:_) | l `elem` admonitions -> do + let (mTitleBs, bodyBs) = + case bs of + -- Matches AST produced by the DocBook reader → Markdown writer → Markdown reader chain. 
+ (Div (_,["title"],_) [Para ts] : rest) -> (Just (inlinesToDocbook opts ts), rest) + -- Matches AST produced by the Docbook reader. + (Div (_,["title"],_) ts : rest) -> (Just (blocksToDocbook opts ts), rest) + _ -> (Nothing, bs) + admonitionTitle <- case mTitleBs of + Nothing -> return mempty + -- id will be attached to the admonition so let’s pass empty identAttrs. + Just titleBs -> inTags False "title" [] <$> titleBs + admonitionBody <- handleDivBody [] bodyBs + return (inTags True l identAttribs (admonitionTitle $$ admonitionBody)) + _ -> handleDivBody identAttribs bs + where + handleDivBody identAttribs [Para lst] = + if hasLineBreaks lst + then flush . nowrap . inTags False "literallayout" identAttribs + <$> inlinesToDocbook opts lst + else inTags True "para" identAttribs <$> inlinesToDocbook opts lst + handleDivBody identAttribs bodyBs = do + contents <- blocksToDocbook opts (map plainToPara bodyBs) + return $ + (if null identAttribs + then mempty + else selfClosingTag "anchor" identAttribs) $$ contents blockToDocbook _ h@Header{} = do -- should be handled by Div section above, except inside lists/blockquotes report $ BlockNotRendered h @@ -213,17 +252,18 @@ blockToDocbook opts (LineBlock lns) = blockToDocbook opts $ linesToPara lns blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" <$> blocksToDocbook opts blocks -blockToDocbook _ (CodeBlock (_,classes,_) str) = return $ +blockToDocbook opts (CodeBlock (_,classes,_) str) = return $ literal ("<programlisting" <> lang <> ">") <> cr <> flush (literal (escapeStringForXML str) <> cr <> literal "</programlisting>") where lang = if null langs then "" else " language=\"" <> escapeStringForXML (head langs) <> "\"" - isLang l = T.toLower l `elem` map T.toLower languages + syntaxMap = writerSyntaxMap opts + isLang l = T.toLower l `elem` map T.toLower (languages syntaxMap) langsFrom s = if isLang s then [s] - else languagesByExtension . T.toLower $ s + else (languagesByExtension syntaxMap) . 
T.toLower $ s langs = concatMap langsFrom classes blockToDocbook opts (BulletList lst) = do let attribs = [("spacing", "compact") | isTightList lst] @@ -341,11 +381,12 @@ inlineToDocbook opts (Quoted _ lst) = inTagsSimple "quote" <$> inlinesToDocbook opts lst inlineToDocbook opts (Cite _ lst) = inlinesToDocbook opts lst -inlineToDocbook opts (Span (ident,_,_) ils) = +inlineToDocbook opts (Span (ident,_,_) ils) = do + version <- ask ((if T.null ident then mempty - else selfClosingTag "anchor" [("id", ident)]) <>) <$> - inlinesToDocbook opts ils + else selfClosingTag "anchor" [(idName version, ident)]) <>) <$> + inlinesToDocbook opts ils inlineToDocbook _ (Code _ str) = return $ inTagsSimple "literal" $ literal (escapeStringForXML str) inlineToDocbook opts (Math t str) @@ -413,3 +454,43 @@ idAndRole (id',cls,_) = ident <> role where ident = [("id", id') | not (T.null id')] role = [("role", T.unwords cls) | not (null cls)] + +isSectionAttr :: DocBookVersion -> (Text, Text) -> Bool +isSectionAttr _ ("label",_) = True +isSectionAttr _ ("status",_) = True +isSectionAttr DocBook5 ("annotations",_) = True +isSectionAttr DocBook5 ("dir","ltr") = True +isSectionAttr DocBook5 ("dir","rtl") = True +isSectionAttr DocBook5 ("dir","lro") = True +isSectionAttr DocBook5 ("dir","rlo") = True +isSectionAttr _ ("remap",_) = True +isSectionAttr _ ("revisionflag","changed") = True +isSectionAttr _ ("revisionflag","added") = True +isSectionAttr _ ("revisionflag","deleted") = True +isSectionAttr _ ("revisionflag","off") = True +isSectionAttr _ ("role",_) = True +isSectionAttr DocBook5 ("version",_) = True +isSectionAttr DocBook5 ("xml:base",_) = True +isSectionAttr DocBook5 ("xml:lang",_) = True +isSectionAttr _ ("xreflabel",_) = True +isSectionAttr DocBook5 ("linkend",_) = True +isSectionAttr DocBook5 ("linkends",_) = True +isSectionAttr DocBook5 ("xlink:actuate",_) = True +isSectionAttr DocBook5 ("xlink:arcrole",_) = True +isSectionAttr DocBook5 ("xlink:from",_) = True +isSectionAttr DocBook5 ("xlink:href",_) = True +isSectionAttr DocBook5 ("xlink:label",_) = True +isSectionAttr DocBook5 ("xlink:role",_) = True +isSectionAttr DocBook5 ("xlink:show",_) = True +isSectionAttr DocBook5 ("xlink:title",_) = True +isSectionAttr DocBook5 ("xlink:to",_) = True +isSectionAttr DocBook5 ("xlink:type",_) = True +isSectionAttr DocBook4 ("arch",_) = True +isSectionAttr DocBook4 ("condition",_) = True +isSectionAttr DocBook4 ("conformance",_) = True +isSectionAttr DocBook4 ("lang",_) = True +isSectionAttr DocBook4 ("os",_) = True +isSectionAttr DocBook4 ("revision",_) = True +isSectionAttr DocBook4 ("security",_) = True +isSectionAttr DocBook4 ("vendor",_) = True +isSectionAttr _ (_,_) = False diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index a380fd4fa..a3c4b6be1 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -7,7 +7,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Docx - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -22,7 +22,6 @@ import Control.Applicative ((<|>)) import Control.Monad.Except (catchError, throwError) import Control.Monad.Reader import Control.Monad.State.Strict -import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import Data.Char (isSpace, isLetter) import Data.List (intercalate, isPrefixOf, isSuffixOf) @@ -31,13 +30,14 @@ import 
qualified Data.Map as M import Data.Maybe (fromMaybe, isNothing, mapMaybe, maybeToList) import qualified Data.Set as Set import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Text.Lazy as TL import Data.Time.Clock.POSIX import Data.Digest.Pure.SHA (sha1, showDigest) import Skylighting -import System.Random (randomRs, mkStdGen) -import Text.Pandoc.BCP47 (getLang, renderLang) -import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) +import Text.Collate.Lang (renderLang) +import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang, translateTerm) +import qualified Text.Pandoc.Translations as Term import qualified Text.Pandoc.Class.PandocMonad as P import Data.Time import Text.Pandoc.UTF8 (fromTextLazy) @@ -47,135 +47,37 @@ import Text.Pandoc.Highlighting (highlight) import Text.Pandoc.Error import Text.Pandoc.ImageSize import Text.Pandoc.Logging -import Text.Pandoc.MIME (MimeType, extensionFromMimeType, getMimeType, - getMimeTypeDef) +import Text.Pandoc.MIME (extensionFromMimeType, getMimeType, getMimeTypeDef) import Text.Pandoc.Options import Text.Pandoc.Writers.Docx.StyleMap +import Text.Pandoc.Writers.Docx.Table +import Text.Pandoc.Writers.Docx.Types import Text.Pandoc.Shared import Text.Pandoc.Walk +import qualified Text.Pandoc.Writers.GridTable as Grid import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared -import Text.Printf (printf) import Text.TeXMath -import Text.XML.Light as XML -import Text.XML.Light.Cursor as XMLC import Text.Pandoc.Writers.OOXML - -data ListMarker = NoMarker - | BulletMarker - | NumberMarker ListNumberStyle ListNumberDelim Int - deriving (Show, Read, Eq, Ord) - -listMarkerToId :: ListMarker -> String -listMarkerToId NoMarker = "990" -listMarkerToId BulletMarker = "991" -listMarkerToId (NumberMarker sty delim n) = - '9' : '9' : styNum : delimNum : show n - where styNum = case sty of - DefaultStyle -> '2' - Example -> '3' - Decimal -> '4' - LowerRoman -> '5' - UpperRoman -> '6' - LowerAlpha -> '7' - UpperAlpha -> '8' - delimNum = case delim of - DefaultDelim -> '0' - Period -> '1' - OneParen -> '2' - TwoParens -> '3' - -data EnvProps = EnvProps{ styleElement :: Maybe Element - , otherElements :: [Element] - } - -instance Semigroup EnvProps where - EnvProps s es <> EnvProps s' es' = EnvProps (s <|> s') (es ++ es') - -instance Monoid EnvProps where - mempty = EnvProps Nothing [] - mappend = (<>) +import Text.Pandoc.XML.Light as XML +import Data.Generics (mkT, everywhere) squashProps :: EnvProps -> [Element] squashProps (EnvProps Nothing es) = es squashProps (EnvProps (Just e) es) = e : es -data WriterEnv = WriterEnv{ envTextProperties :: EnvProps - , envParaProperties :: EnvProps - , envRTL :: Bool - , envListLevel :: Int - , envListNumId :: Int - , envInDel :: Bool - , envChangesAuthor :: T.Text - , envChangesDate :: T.Text - , envPrintWidth :: Integer - } - -defaultWriterEnv :: WriterEnv -defaultWriterEnv = WriterEnv{ envTextProperties = mempty - , envParaProperties = mempty - , envRTL = False - , envListLevel = -1 - , envListNumId = 1 - , envInDel = False - , envChangesAuthor = "unknown" - , envChangesDate = "1969-12-31T19:00:00Z" - , envPrintWidth = 1 - } - -data WriterState = WriterState{ - stFootnotes :: [Element] - , stComments :: [([(T.Text, T.Text)], [Inline])] - , stSectionIds :: Set.Set T.Text - , stExternalLinks :: M.Map String String - , stImages :: M.Map FilePath (String, String, Maybe MimeType, B.ByteString) - , stLists :: [ListMarker] - , stInsId :: Int - , stDelId :: Int - , stStyleMaps :: 
StyleMaps - , stFirstPara :: Bool - , stInTable :: Bool - , stInList :: Bool - , stTocTitle :: [Inline] - , stDynamicParaProps :: Set.Set ParaStyleName - , stDynamicTextProps :: Set.Set CharStyleName - , stCurId :: Int - } - -defaultWriterState :: WriterState -defaultWriterState = WriterState{ - stFootnotes = defaultFootnotes - , stComments = [] - , stSectionIds = Set.empty - , stExternalLinks = M.empty - , stImages = M.empty - , stLists = [NoMarker] - , stInsId = 1 - , stDelId = 1 - , stStyleMaps = StyleMaps M.empty M.empty - , stFirstPara = False - , stInTable = False - , stInList = False - , stTocTitle = [Str "Table of Contents"] - , stDynamicParaProps = Set.empty - , stDynamicTextProps = Set.empty - , stCurId = 20 - } - -type WS m = ReaderT WriterEnv (StateT WriterState m) - -renumIdMap :: Int -> [Element] -> M.Map String String +renumIdMap :: Int -> [Element] -> M.Map Text Text renumIdMap _ [] = M.empty renumIdMap n (e:es) | Just oldId <- findAttr (QName "Id" Nothing Nothing) e = - M.insert oldId ("rId" ++ show n) (renumIdMap (n+1) es) + M.insert oldId ("rId" <> tshow n) (renumIdMap (n+1) es) | otherwise = renumIdMap n es -replaceAttr :: (QName -> Bool) -> String -> [XML.Attr] -> [XML.Attr] +replaceAttr :: (QName -> Bool) -> Text -> [XML.Attr] -> [XML.Attr] replaceAttr f val = map $ \a -> if f (attrKey a) then XML.Attr (attrKey a) val else a -renumId :: (QName -> Bool) -> M.Map String String -> Element -> Element +renumId :: (QName -> Bool) -> M.Map Text Text -> Element -> Element renumId f renumMap e | Just oldId <- findAttrBy f e , Just newId <- M.lookup oldId renumMap = @@ -184,18 +86,12 @@ renumId f renumMap e e { elAttribs = attrs' } | otherwise = e -renumIds :: (QName -> Bool) -> M.Map String String -> [Element] -> [Element] +renumIds :: (QName -> Bool) -> M.Map Text Text -> [Element] -> [Element] renumIds f renumMap = map (renumId f renumMap) -findAttrTextBy :: (QName -> Bool) -> Element -> Maybe T.Text -findAttrTextBy x = fmap T.pack . findAttrBy x - -lookupAttrTextBy :: (QName -> Bool) -> [XML.Attr] -> Maybe T.Text -lookupAttrTextBy x = fmap T.pack . lookupAttrBy x - -- | Certain characters are invalid in XML even if escaped. -- See #1992 -stripInvalidChars :: T.Text -> T.Text +stripInvalidChars :: Text -> Text stripInvalidChars = T.filter isValidChar -- | See XML reference @@ -217,7 +113,7 @@ writeDocx opts doc = do let doc' = Pandoc meta blocks' username <- P.lookupEnv "USERNAME" - utctime <- P.getCurrentTime + utctime <- P.getTimestamp oldUserDataDir <- P.getUserDataDir P.setUserDataDir Nothing res <- P.readDefaultDataFile "reference.docx" @@ -234,11 +130,11 @@ writeDocx opts doc = do -- Gets the template size let mbpgsz = mbsectpr >>= filterElementName (wname (=="pgSz")) - let mbAttrSzWidth = mbpgsz >>= lookupAttrTextBy ((=="w") . qName) . elAttribs + let mbAttrSzWidth = mbpgsz >>= lookupAttrBy ((=="w") . qName) . elAttribs let mbpgmar = mbsectpr >>= filterElementName (wname (=="pgMar")) - let mbAttrMarLeft = mbpgmar >>= lookupAttrTextBy ((=="left") . qName) . elAttribs - let mbAttrMarRight = mbpgmar >>= lookupAttrTextBy ((=="right") . qName) . elAttribs + let mbAttrMarLeft = mbpgmar >>= lookupAttrBy ((=="left") . qName) . elAttribs + let mbAttrMarRight = mbpgmar >>= lookupAttrBy ((=="right") . qName) . elAttribs -- Get the available area (converting the size and the margins to int and -- doing the difference @@ -250,24 +146,21 @@ writeDocx opts doc = do -- styles mblang <- toLang $ getLang opts meta + -- TODO FIXME avoid this generic traversal! 
+ -- lang is in w:docDefaults / w:rPr / w:lang let addLang :: Element -> Element - addLang e = case (\l -> XMLC.toTree . go (T.unpack $ renderLang l) $ - XMLC.fromElement e) <$> mblang of - Just (Elem e') -> e' - _ -> e -- return original - where go :: String -> Cursor -> Cursor - go l cursor = case XMLC.findRec (isLangElt . current) cursor of - Nothing -> cursor - Just t -> XMLC.modifyContent (setval l) t - setval :: String -> Content -> Content - setval l (Elem e') = Elem $ e'{ elAttribs = map (setvalattr l) $ - elAttribs e' } - setval _ x = x - setvalattr :: String -> XML.Attr -> XML.Attr - setvalattr l (XML.Attr qn@(QName "val" _ _) _) = XML.Attr qn l - setvalattr _ x = x - isLangElt (Elem e') = qName (elName e') == "lang" - isLangElt _ = False + addLang = case mblang of + Nothing -> id + Just l -> everywhere (mkT (go (renderLang l))) + where + go :: Text -> Element -> Element + go l e' + | qName (elName e') == "lang" + = e'{ elAttribs = map (setvalattr l) $ elAttribs e' } + | otherwise = e' + + setvalattr l (XML.Attr qn@(QName "val" _ _) _) = XML.Attr qn l + setvalattr _ x = x let stylepath = "word/styles.xml" styledoc <- addLang <$> parseXml refArchive distArchive stylepath @@ -337,12 +230,13 @@ writeDocx opts doc = do -- [Content_Types].xml let mkOverrideNode (part', contentType') = mknode "Override" - [("PartName",part'),("ContentType",contentType')] () + [("PartName", T.pack part') + ,("ContentType", contentType')] () let mkImageOverride (_, imgpath, mbMimeType, _) = - mkOverrideNode ("/word/" ++ imgpath, - maybe "application/octet-stream" T.unpack mbMimeType) + mkOverrideNode ("/word/" <> imgpath, + fromMaybe "application/octet-stream" mbMimeType) let mkMediaOverride imgpath = - mkOverrideNode ('/':imgpath, T.unpack $ getMimeTypeDef imgpath) + mkOverrideNode ("/" <> imgpath, getMimeTypeDef imgpath) let overrides = map mkOverrideNode ( [("/word/webSettings.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml") @@ -369,13 +263,14 @@ writeDocx opts doc = do ,("/word/footnotes.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml") ] ++ - map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + map (\x -> (maybe "" (T.unpack . ("/word/" <>)) (extractTarget x), "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml")) headers ++ - map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + map (\x -> (maybe "" (T.unpack . 
("/word/" <>)) (extractTarget x), "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml")) footers) ++ map mkImageOverride imgs ++ - [ mkMediaOverride (eRelativePath e) | e <- zEntries refArchive - , "word/media/" `isPrefixOf` eRelativePath e ] + [ mkMediaOverride (eRelativePath e) + | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] let defaultnodes = [mknode "Default" [("Extension","xml"),("ContentType","application/xml")] (), @@ -421,7 +316,7 @@ writeDocx opts doc = do let renumHeaders = renumIds (\q -> qName q == "Id") idMap headers let renumFooters = renumIds (\q -> qName q == "Id") idMap footers let baserels = baserels' ++ renumHeaders ++ renumFooters - let toImgRel (ident,path,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",ident),("Target",path)] () + let toImgRel (ident,path,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",T.pack ident),("Target",T.pack path)] () let imgrels = map toImgRel imgs let toLinkRel (src,ident) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"),("Id",ident),("Target",src),("TargetMode","External") ] () let linkrels = map toLinkRel $ M.toList $ stExternalLinks st @@ -441,7 +336,7 @@ writeDocx opts doc = do Nothing -> mknode "w:sectPr" [] () -- let sectpr = fromMaybe (mknode "w:sectPr" [] ()) mbsectpr' - let contents' = contents ++ [sectpr] + let contents' = contents ++ [Elem sectpr] let docContents = mknode "w:document" stdAttributes $ mknode "w:body" [] contents' @@ -489,10 +384,10 @@ writeDocx opts doc = do numbering <- parseXml refArchive distArchive numpath let newNumElts = mkNumbering (stLists st) let pandocAdded e = - case findAttrTextBy ((== "abstractNumId") . qName) e >>= safeRead of + case findAttrBy ((== "abstractNumId") . qName) e >>= safeRead of Just numid -> numid >= (990 :: Int) Nothing -> - case findAttrTextBy ((== "numId") . qName) e >>= safeRead of + case findAttrBy ((== "numId") . qName) e >>= safeRead of Just numid -> numid >= (1000 :: Int) Nothing -> False let oldElts = filter (not . 
pandocAdded) $ onlyElems (elContent numbering) @@ -514,7 +409,7 @@ writeDocx opts doc = do let extraCoreProps = ["subject","lang","category","description"] let extraCorePropsMap = M.fromList $ zip extraCoreProps ["dc:subject","dc:language","cp:category","dc:description"] - let lookupMetaString' :: T.Text -> Meta -> T.Text + let lookupMetaString' :: Text -> Meta -> Text lookupMetaString' key' meta' = case key' of "description" -> T.intercalate "_x000d_\n" (map stringify $ lookupMetaBlocks "description" meta') @@ -530,20 +425,21 @@ writeDocx opts doc = do : mktnode "dc:creator" [] (T.intercalate "; " (map stringify $ docAuthors meta)) : [ mktnode (M.findWithDefault "" k extraCorePropsMap) [] (lookupMetaString' k meta) | k <- M.keys (unMeta meta), k `elem` extraCoreProps] - ++ mknode "cp:keywords" [] (T.unpack $ T.intercalate ", " keywords) + ++ mknode "cp:keywords" [] (T.intercalate ", " keywords) : (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] x , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x - ]) (formatTime defaultTimeLocale "%FT%XZ" utctime) + ]) (T.pack $ formatTime defaultTimeLocale "%FT%XZ" utctime) let docPropsEntry = toEntry docPropsPath epochtime $ renderXml docProps -- docProps/custom.xml - let customProperties :: [(String, String)] - customProperties = [(T.unpack k, T.unpack $ lookupMetaString k meta) | k <- M.keys (unMeta meta) + let customProperties :: [(Text, Text)] + customProperties = [ (k, lookupMetaString k meta) + | k <- M.keys (unMeta meta) , k `notElem` (["title", "author", "keywords"] ++ extraCoreProps)] let mkCustomProp (k, v) pid = mknode "property" [("fmtid","{D5CDD505-2E9C-101B-9397-08002B2CF9AE}") - ,("pid", show pid) + ,("pid", tshow pid) ,("name", k)] $ mknode "vt:lpwstr" [] v let customPropsPath = "docProps/custom.xml" let customProps = mknode "Properties" @@ -574,12 +470,27 @@ writeDocx opts doc = do -- adds references to footnotes or endnotes we don't have... -- we do, however, copy some settings over from reference let settingsPath = "word/settings.xml" - settingsList = [ "w:autoHyphenation" - , "w:consecutiveHyphenLimit" - , "w:hyphenationZone" - , "w:doNotHyphenateCap" - , "w:evenAndOddHeaders" - , "w:proofState" + settingsList = [ "zoom" + , "embedSystemFonts" + , "doNotTrackMoves" + , "defaultTabStop" + , "drawingGridHorizontalSpacing" + , "drawingGridVerticalSpacing" + , "displayHorizontalDrawingGridEvery" + , "displayVerticalDrawingGridEvery" + , "characterSpacingControl" + , "savePreviewPicture" + , "mathPr" + , "themeFontLang" + , "decimalSymbol" + , "listSeparator" + , "autoHyphenation" + , "consecutiveHyphenLimit" + , "hyphenationZone" + , "doNotHyphenateCap" + , "evenAndOddHeaders" + , "proofState" + , "compat" ] settingsEntry <- copyChildren refArchive distArchive settingsPath epochtime settingsList @@ -593,7 +504,8 @@ writeDocx opts doc = do fontTableEntry <- entryFromArchive refArchive "word/fontTable.xml" webSettingsEntry <- entryFromArchive refArchive "word/webSettings.xml" headerFooterEntries <- mapM (entryFromArchive refArchive . ("word/" ++)) $ - mapMaybe extractTarget (headers ++ footers) + mapMaybe (fmap T.unpack . extractTarget) + (headers ++ footers) let miscRelEntries = [ e | e <- zEntries refArchive , "word/_rels/" `isPrefixOf` eRelativePath e , ".xml.rels" `isSuffixOf` eRelativePath e @@ -619,8 +531,8 @@ newParaPropToOpenXml (fromStyleName -> s) = let styleId = T.filter (not . 
isSpace) s in mknode "w:style" [ ("w:type", "paragraph") , ("w:customStyle", "1") - , ("w:styleId", T.unpack styleId)] - [ mknode "w:name" [("w:val", T.unpack s)] () + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () , mknode "w:basedOn" [("w:val","BodyText")] () , mknode "w:qFormat" [] () ] @@ -630,8 +542,8 @@ newTextPropToOpenXml (fromStyleName -> s) = let styleId = T.filter (not . isSpace) s in mknode "w:style" [ ("w:type", "character") , ("w:customStyle", "1") - , ("w:styleId", T.unpack styleId)] - [ mknode "w:name" [("w:val", T.unpack s)] () + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () , mknode "w:basedOn" [("w:val","BodyTextChar")] () ] @@ -642,13 +554,14 @@ styleToOpenXml sm style = toStyle toktype | hasStyleName (fromString $ show toktype) (smCharStyle sm) = Nothing | otherwise = Just $ mknode "w:style" [("w:type","character"), - ("w:customStyle","1"),("w:styleId",show toktype)] - [ mknode "w:name" [("w:val",show toktype)] () + ("w:customStyle","1"),("w:styleId", tshow toktype)] + [ mknode "w:name" [("w:val", tshow toktype)] () , mknode "w:basedOn" [("w:val","VerbatimChar")] () , mknode "w:rPr" [] $ - [ mknode "w:color" [("w:val",tokCol toktype)] () + [ mknode "w:color" [("w:val", tokCol toktype)] () | tokCol toktype /= "auto" ] ++ - [ mknode "w:shd" [("w:val","clear"),("w:fill",tokBg toktype)] () + [ mknode "w:shd" [("w:val","clear") + ,("w:fill",tokBg toktype)] () | tokBg toktype /= "auto" ] ++ [ mknode "w:b" [] () | tokFeature tokenBold toktype ] ++ [ mknode "w:i" [] () | tokFeature tokenItalic toktype ] ++ @@ -656,10 +569,10 @@ styleToOpenXml sm style = ] tokStyles = tokenStyles style tokFeature f toktype = maybe False f $ M.lookup toktype tokStyles - tokCol toktype = maybe "auto" (drop 1 . fromColor) + tokCol toktype = maybe "auto" (T.pack . drop 1 . fromColor) $ (tokenColor =<< M.lookup toktype tokStyles) `mplus` defaultColor style - tokBg toktype = maybe "auto" (drop 1 . fromColor) + tokBg toktype = maybe "auto" (T.pack . drop 1 . fromColor) $ (tokenBackground =<< M.lookup toktype tokStyles) `mplus` backgroundColor style parStyle | hasStyleName "Source Code" (smParaStyle sm) = Nothing @@ -672,23 +585,25 @@ styleToOpenXml sm style = , mknode "w:pPr" [] $ mknode "w:wordWrap" [("w:val","off")] () : - maybe [] (\col -> [mknode "w:shd" [("w:val","clear"),("w:fill",drop 1 $ fromColor col)] ()]) (backgroundColor style) + maybe [] (\col -> [mknode "w:shd" [("w:val","clear"),("w:fill", T.pack $ drop 1 $ fromColor col)] ()]) (backgroundColor style) ] -copyChildren :: (PandocMonad m) => Archive -> Archive -> String -> Integer -> [String] -> m Entry +copyChildren :: (PandocMonad m) + => Archive -> Archive -> String -> Integer -> [Text] -> m Entry copyChildren refArchive distArchive path timestamp elNames = do ref <- parseXml refArchive distArchive path dist <- parseXml distArchive distArchive path + let elsToCopy = + map cleanElem $ filterChildrenName (\e -> qName e `elem` elNames) ref + let elsToKeep = + [e | Elem e <- elContent dist, not (any (hasSameNameAs e) elsToCopy)] return $ toEntry path timestamp $ renderXml dist{ - elContent = elContent dist ++ copyContent ref + elContent = map Elem elsToKeep ++ map Elem elsToCopy } where - strName QName{qName=name, qPrefix=prefix} - | Just p <- prefix = p++":"++name - | otherwise = name - shouldCopy = (`elem` elNames) . strName - cleanElem el@Element{elName=name} = Elem el{elName=name{qURI=Nothing}} - copyContent = map cleanElem . 
filterChildrenName shouldCopy + hasSameNameAs (Element {elName = n1}) (Element {elName = n2}) = + qName n1 == qName n2 + cleanElem el@Element{elName=name} = el{elName=name{qURI=Nothing}} -- this is the lowest number used for a list numId baseListId :: Int @@ -697,43 +612,42 @@ baseListId = 1000 mkNumbering :: [ListMarker] -> [Element] mkNumbering lists = elts ++ zipWith mkNum lists [baseListId..(baseListId + length lists - 1)] - where elts = zipWith mkAbstractNum (ordNub lists) $ - randomRs (0x10000000, 0xFFFFFFFF) $ mkStdGen 1848 + where elts = map mkAbstractNum (ordNub lists) maxListLevel :: Int maxListLevel = 8 mkNum :: ListMarker -> Int -> Element mkNum marker numid = - mknode "w:num" [("w:numId",show numid)] + mknode "w:num" [("w:numId",tshow numid)] $ mknode "w:abstractNumId" [("w:val",listMarkerToId marker)] () : case marker of NoMarker -> [] BulletMarker -> [] NumberMarker _ _ start -> - map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",show (lvl :: Int))] - $ mknode "w:startOverride" [("w:val",show start)] ()) + map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",tshow (lvl :: Int))] + $ mknode "w:startOverride" [("w:val",tshow start)] ()) [0..maxListLevel] -mkAbstractNum :: ListMarker -> Integer -> Element -mkAbstractNum marker nsid = +mkAbstractNum :: ListMarker -> Element +mkAbstractNum marker = mknode "w:abstractNum" [("w:abstractNumId",listMarkerToId marker)] - $ mknode "w:nsid" [("w:val", printf "%8x" nsid)] () + $ mknode "w:nsid" [("w:val", "A" <> listMarkerToId marker)] () : mknode "w:multiLevelType" [("w:val","multilevel")] () : map (mkLvl marker) [0..maxListLevel] mkLvl :: ListMarker -> Int -> Element mkLvl marker lvl = - mknode "w:lvl" [("w:ilvl",show lvl)] $ + mknode "w:lvl" [("w:ilvl",tshow lvl)] $ [ mknode "w:start" [("w:val",start)] () | marker /= NoMarker && marker /= BulletMarker ] ++ [ mknode "w:numFmt" [("w:val",fmt)] () - , mknode "w:lvlText" [("w:val",lvltxt)] () + , mknode "w:lvlText" [("w:val", lvltxt)] () , mknode "w:lvlJc" [("w:val","left")] () , mknode "w:pPr" [] - [ mknode "w:ind" [ ("w:left",show $ lvl * step + step) - , ("w:hanging",show (hang :: Int)) + [ mknode "w:ind" [ ("w:left",tshow $ lvl * step + step) + , ("w:hanging",tshow (hang :: Int)) ] () ] ] @@ -742,8 +656,8 @@ mkLvl marker lvl = NoMarker -> ("bullet"," ","1") BulletMarker -> ("bullet",bulletFor lvl,"1") NumberMarker st de n -> (styleFor st lvl - ,patternFor de ("%" ++ show (lvl + 1)) - ,show n) + ,patternFor de ("%" <> tshow (lvl + 1)) + ,tshow n) step = 720 hang = 480 bulletFor 0 = "\x2022" -- filled circle @@ -766,9 +680,9 @@ mkLvl marker lvl = styleFor DefaultStyle 5 = "lowerRoman" styleFor DefaultStyle x = styleFor DefaultStyle (x `mod` 6) styleFor _ _ = "decimal" - patternFor OneParen s = s ++ ")" - patternFor TwoParens s = "(" ++ s ++ ")" - patternFor _ s = s ++ "." + patternFor OneParen s = s <> ")" + patternFor TwoParens s = "(" <> s <> ")" + patternFor _ s = s <> "." getNumId :: (PandocMonad m) => WS m Int getNumId = (((baseListId - 1) +) . length) `fmap` gets stLists @@ -776,8 +690,8 @@ getNumId = (((baseListId - 1) +) . 
length) `fmap` gets stLists makeTOC :: (PandocMonad m) => WriterOptions -> WS m [Element] makeTOC opts = do - let depth = "1-"++show (writerTOCDepth opts) - let tocCmd = "TOC \\o \""++depth++"\" \\h \\z \\u" + let depth = "1-" <> tshow (writerTOCDepth opts) + let tocCmd = "TOC \\o \"" <> depth <> "\" \\h \\z \\u" tocTitle <- gets stTocTitle title <- withParaPropM (pStyleM "TOC Heading") (blocksToOpenXML opts [Para tocTitle]) return @@ -788,7 +702,7 @@ makeTOC opts = do mknode "w:docPartUnique" [] ()] -- w:docPartObj ), -- w:sdtPr - mknode "w:sdtContent" [] (title++[ + mknode "w:sdtContent" [] (title ++ [ Elem $ mknode "w:p" [] ( mknode "w:r" [] [ mknode "w:fldChar" [("w:fldCharType","begin"),("w:dirty","true")] (), @@ -802,7 +716,9 @@ makeTOC opts = do -- | Convert Pandoc document to two lists of -- OpenXML elements (the main document and footnotes). -writeOpenXML :: (PandocMonad m) => WriterOptions -> Pandoc -> WS m ([Element], [Element],[Element]) +writeOpenXML :: (PandocMonad m) + => WriterOptions -> Pandoc + -> WS m ([Content], [Element], [Element]) writeOpenXML opts (Pandoc meta blocks) = do let tit = docTitle meta let auths = docAuthors meta @@ -828,8 +744,9 @@ writeOpenXML opts (Pandoc meta blocks) = do let toComment (kvs, ils) = do annotation <- inlinesToOpenXML opts ils return $ - mknode "w:comment" [('w':':':T.unpack k,T.unpack v) | (k,v) <- kvs] + mknode "w:comment" [("w:" <> k, v) | (k,v) <- kvs] [ mknode "w:p" [] $ + map Elem [ mknode "w:pPr" [] [ mknode "w:pStyle" [("w:val", "CommentText")] () ] , mknode "w:r" [] @@ -844,11 +761,11 @@ writeOpenXML opts (Pandoc meta blocks) = do toc <- if includeTOC then makeTOC opts else return [] - let meta' = title ++ subtitle ++ authors ++ date ++ abstract ++ toc + let meta' = title ++ subtitle ++ authors ++ date ++ abstract ++ map Elem toc return (meta' ++ doc', notes', comments') -- | Convert a list of Pandoc blocks to OpenXML. -blocksToOpenXML :: (PandocMonad m) => WriterOptions -> [Block] -> WS m [Element] +blocksToOpenXML :: (PandocMonad m) => WriterOptions -> [Block] -> WS m [Content] blocksToOpenXML opts = fmap concat . mapM (blockToOpenXML opts) . separateTables -- Word combines adjacent tables unless you put an empty paragraph between @@ -859,35 +776,29 @@ separateTables (x@Table{}:xs@(Table{}:_)) = x : RawBlock (Format "openxml") "<w:p />" : separateTables xs separateTables (x:xs) = x : separateTables xs -pStyleM :: (PandocMonad m) => ParaStyleName -> WS m XML.Element -pStyleM styleName = do - pStyleMap <- gets (smParaStyle . stStyleMaps) - let sty' = getStyleIdFromName styleName pStyleMap - return $ mknode "w:pStyle" [("w:val", T.unpack $ fromStyleId sty')] () - rStyleM :: (PandocMonad m) => CharStyleName -> WS m XML.Element rStyleM styleName = do cStyleMap <- gets (smCharStyle . stStyleMaps) let sty' = getStyleIdFromName styleName cStyleMap - return $ mknode "w:rStyle" [("w:val", T.unpack $ fromStyleId sty')] () + return $ mknode "w:rStyle" [("w:val", fromStyleId sty')] () -getUniqueId :: (PandocMonad m) => WS m String +getUniqueId :: (PandocMonad m) => WS m Text -- the + 20 is to ensure that there are no clashes with the rIds -- already in word/document.xml.rel getUniqueId = do n <- gets stCurId modify $ \st -> st{stCurId = n + 1} - return $ show n + return $ tshow n -- | Key for specifying user-defined docx styles. -dynamicStyleKey :: T.Text +dynamicStyleKey :: Text dynamicStyleKey = "custom-style" -- | Convert a Pandoc block element to OpenXML. 
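The "custom-style" attribute read off via dynamicStyleKey can be set from the builder API; a minimal sketch (not part of this patch; "MyQuote" is a hypothetical style name, for which, per newParaPropToOpenXml above, the writer adds a basic definition based on BodyText if the reference docx does not define it):

{-# LANGUAGE OverloadedStrings #-}
import Text.Pandoc
import Text.Pandoc.Builder (doc, divWith, para)
import qualified Data.ByteString.Lazy as BL

main :: IO ()
main = do
  let d = doc $ divWith ("", [], [("custom-style", "MyQuote")])
                        (para "This paragraph is written with the MyQuote style.")
  result <- runIO $ writeDocx def d
  handleError result >>= BL.writeFile "custom-style.docx"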
-blockToOpenXML :: (PandocMonad m) => WriterOptions -> Block -> WS m [Element] +blockToOpenXML :: (PandocMonad m) => WriterOptions -> Block -> WS m [Content] blockToOpenXML opts blk = withDirection $ blockToOpenXML' opts blk -blockToOpenXML' :: (PandocMonad m) => WriterOptions -> Block -> WS m [Element] +blockToOpenXML' :: (PandocMonad m) => WriterOptions -> Block -> WS m [Content] blockToOpenXML' _ Null = return [] blockToOpenXML' opts (Div (ident,_classes,kvs) bs) = do stylemod <- case lookup dynamicStyleKey kvs of @@ -921,18 +832,18 @@ blockToOpenXML' opts (Header lev (ident,_,kvs) lst) = do Just n -> do num <- withTextPropM (rStyleM "SectionNumber") (inlineToOpenXML opts (Str n)) - return $ num ++ [mknode "w:r" [] [mknode "w:tab" [] ()]] + return $ num ++ [Elem $ mknode "w:r" [] [mknode "w:tab" [] ()]] Nothing -> return [] else return [] contents <- (number ++) <$> inlinesToOpenXML opts lst if T.null ident - then return [mknode "w:p" [] (paraProps ++ contents)] + then return [Elem $ mknode "w:p" [] (map Elem paraProps ++ contents)] else do let bookmarkName = ident modify $ \s -> s{ stSectionIds = Set.insert bookmarkName $ stSectionIds s } bookmarkedContents <- wrapBookmark bookmarkName contents - return [mknode "w:p" [] (paraProps ++ bookmarkedContents)] + return [Elem $ mknode "w:p" [] (map Elem paraProps ++ bookmarkedContents)] blockToOpenXML' opts (Plain lst) = do isInTable <- gets stInTable isInList <- gets stInList @@ -944,15 +855,31 @@ blockToOpenXML' opts (Plain lst) = do -- title beginning with fig: indicates that the image is a figure blockToOpenXML' opts (Para [Image attr alt (src,T.stripPrefix "fig:" -> Just tit)]) = do setFirstPara + fignum <- gets stNextFigureNum + unless (null alt) $ modify $ \st -> st{ stNextFigureNum = fignum + 1 } + let figid = "fig" <> tshow fignum + figname <- translateTerm Term.Figure prop <- pStyleM $ if null alt then "Figure" else "Captioned Figure" paraProps <- local (\env -> env { envParaProperties = EnvProps (Just prop) [] <> envParaProperties env }) (getParaProps False) contents <- inlinesToOpenXML opts [Image attr alt (src,tit)] - captionNode <- withParaPropM (pStyleM "Image Caption") - $ blockToOpenXML opts (Para alt) - return $ mknode "w:p" [] (paraProps ++ contents) : captionNode + captionNode <- if null alt + then return [] + else withParaPropM (pStyleM "Image Caption") + $ blockToOpenXML opts + (Para $ Span (figid,[],[]) + [Str (figname <> "\160"), + RawInline (Format "openxml") + ("<w:fldSimple w:instr=\"SEQ Figure" + <> " \\* ARABIC \"><w:r><w:t>" + <> tshow fignum + <> "</w:t></w:r></w:fldSimple>"), + Str ":", Space] : alt) + return $ + Elem (mknode "w:p" [] (map Elem paraProps ++ contents)) + : captionNode blockToOpenXML' opts (Para lst) | null lst && not (isEnabled Ext_empty_paragraphs opts) = return [] | otherwise = do @@ -969,10 +896,12 @@ blockToOpenXML' opts (Para lst) ps -> ps modify $ \s -> s { stFirstPara = False } contents <- inlinesToOpenXML opts lst - return [mknode "w:p" [] (paraProps' ++ contents)] + return [Elem $ mknode "w:p" [] (map Elem paraProps' ++ contents)] blockToOpenXML' opts (LineBlock lns) = blockToOpenXML opts $ linesToPara lns blockToOpenXML' _ b@(RawBlock format str) - | format == Format "openxml" = return [ x | Elem x <- parseXML str ] + | format == Format "openxml" = return [ + Text (CData CDataRaw str Nothing) + ] | otherwise = do report $ BlockNotRendered b return [] @@ -987,73 +916,14 @@ blockToOpenXML' opts (CodeBlock attrs@(ident, _, _) str) = do wrapBookmark ident p blockToOpenXML' _ HorizontalRule 
= do setFirstPara - return [ + return [ Elem $ mknode "w:p" [] $ mknode "w:r" [] $ mknode "w:pict" [] $ mknode "v:rect" [("style","width:0;height:1.5pt"), ("o:hralign","center"), ("o:hrstd","t"),("o:hr","t")] () ] -blockToOpenXML' opts (Table _ blkCapt specs thead tbody tfoot) = do - let (caption, aligns, widths, headers, rows) = toLegacyTable blkCapt specs thead tbody tfoot - setFirstPara - modify $ \s -> s { stInTable = True } - let captionStr = stringify caption - caption' <- if null caption - then return [] - else withParaPropM (pStyleM "Table Caption") - $ blockToOpenXML opts (Para caption) - let alignmentFor al = mknode "w:jc" [("w:val",alignmentToString al)] () - -- Table cells require a <w:p> element, even an empty one! - -- Not in the spec but in Word 2007, 2010. See #4953. - let cellToOpenXML (al, cell) = do - es <- withParaProp (alignmentFor al) $ blocksToOpenXML opts cell - return $ if any (\e -> qName (elName e) == "p") es - then es - else es ++ [mknode "w:p" [] ()] - headers' <- mapM cellToOpenXML $ zip aligns headers - rows' <- mapM (mapM cellToOpenXML . zip aligns) rows - let borderProps = mknode "w:tcPr" [] - [ mknode "w:tcBorders" [] - $ mknode "w:bottom" [("w:val","single")] () - , mknode "w:vAlign" [("w:val","bottom")] () ] - compactStyle <- pStyleM "Compact" - let emptyCell' = [mknode "w:p" [] [mknode "w:pPr" [] [compactStyle]]] - let mkcell border contents = mknode "w:tc" [] - $ [ borderProps | border ] ++ - if null contents - then emptyCell' - else contents - let mkrow border cells = mknode "w:tr" [] $ - [mknode "w:trPr" [] [ - mknode "w:cnfStyle" [("w:firstRow","1")] ()] | border] - ++ map (mkcell border) cells - let textwidth = 7920 -- 5.5 in in twips, 1/20 pt - let fullrow = 5000 -- 100% specified in pct - let rowwidth = fullrow * sum widths - let mkgridcol w = mknode "w:gridCol" - [("w:w", show (floor (textwidth * w) :: Integer))] () - let hasHeader = not $ all null headers - modify $ \s -> s { stInTable = False } - return $ - caption' ++ - [mknode "w:tbl" [] - ( mknode "w:tblPr" [] - ( mknode "w:tblStyle" [("w:val","Table")] () : - mknode "w:tblW" [("w:type", "pct"), ("w:w", show rowwidth)] () : - mknode "w:tblLook" [("w:firstRow",if hasHeader then "1" else "0") - ,("w:lastRow","0") - ,("w:firstColumn","0") - ,("w:lastColumn","0") - ,("w:noHBand","0") - ,("w:noVBand","0")] () : - [ mknode "w:tblCaption" [("w:val", T.unpack captionStr)] () - | not (null caption) ] ) - : mknode "w:tblGrid" [] - (if all (==0) widths - then [] - else map mkgridcol widths) - : [ mkrow True headers' | hasHeader ] ++ - map (mkrow False) rows' - )] +blockToOpenXML' opts (Table attr caption colspecs thead tbodies tfoot) = + tableToOpenXML (blocksToOpenXML opts) + (Grid.toTable attr caption colspecs thead tbodies tfoot) blockToOpenXML' opts el | BulletList lst <- el = addOpenXMLList BulletMarker lst | OrderedList (start, numstyle, numdelim) lst <- el @@ -1070,7 +940,9 @@ blockToOpenXML' opts (DefinitionList items) = do setFirstPara return l -definitionListItemToOpenXML :: (PandocMonad m) => WriterOptions -> ([Inline],[[Block]]) -> WS m [Element] +definitionListItemToOpenXML :: (PandocMonad m) + => WriterOptions -> ([Inline],[[Block]]) + -> WS m [Content] definitionListItemToOpenXML opts (term,defs) = do term' <- withParaPropM (pStyleM "Definition Term") $ blockToOpenXML opts (Para term) @@ -1083,8 +955,11 @@ addList marker = do lists <- gets stLists modify $ \st -> st{ stLists = lists ++ [marker] } -listItemToOpenXML :: (PandocMonad m) => WriterOptions -> Int -> [Block] -> WS m [Element] 
-listItemToOpenXML _ _ [] = return [] +listItemToOpenXML :: (PandocMonad m) + => WriterOptions + -> Int -> [Block] + -> WS m [Content] +listItemToOpenXML _ _ [] = return [] listItemToOpenXML opts numid (first:rest) = do oldInList <- gets stInList modify $ \st -> st{ stInList = True } @@ -1103,15 +978,8 @@ listItemToOpenXML opts numid (first:rest) = do modify $ \st -> st{ stInList = oldInList } return $ first'' ++ rest'' -alignmentToString :: Alignment -> [Char] -alignmentToString alignment = case alignment of - AlignLeft -> "left" - AlignRight -> "right" - AlignCenter -> "center" - AlignDefault -> "left" - -- | Convert a list of inline elements to OpenXML. -inlinesToOpenXML :: (PandocMonad m) => WriterOptions -> [Inline] -> WS m [Element] +inlinesToOpenXML :: PandocMonad m => WriterOptions -> [Inline] -> WS m [Content] inlinesToOpenXML opts lst = concat `fmap` mapM (inlineToOpenXML opts) lst withNumId :: (PandocMonad m) => Int -> WS m a -> WS m a @@ -1120,10 +988,6 @@ withNumId numid = local $ \env -> env{ envListNumId = numid } asList :: (PandocMonad m) => WS m a -> WS m a asList = local $ \env -> env{ envListLevel = envListLevel env + 1 } -isStyle :: Element -> Bool -isStyle e = isElem [] "w" "rStyle" e || - isElem [] "w" "pStyle" e - getTextProps :: (PandocMonad m) => WS m [Element] getTextProps = do props <- asks envTextProperties @@ -1146,23 +1010,13 @@ getParaProps displayMathPara = do listLevel <- asks envListLevel numid <- asks envListNumId let listPr = [mknode "w:numPr" [] - [ mknode "w:ilvl" [("w:val",show listLevel)] () - , mknode "w:numId" [("w:val",show numid)] () ] | listLevel >= 0 && not displayMathPara] + [ mknode "w:ilvl" [("w:val",tshow listLevel)] () + , mknode "w:numId" [("w:val",tshow numid)] () ] | listLevel >= 0 && not displayMathPara] return $ case listPr ++ squashProps props of [] -> [] ps -> [mknode "w:pPr" [] ps] -withParaProp :: PandocMonad m => Element -> WS m a -> WS m a -withParaProp d p = - local (\env -> env {envParaProperties = ep <> envParaProperties env}) p - where ep = if isStyle d then EnvProps (Just d) [] else EnvProps Nothing [d] - -withParaPropM :: PandocMonad m => WS m Element -> WS m a -> WS m a -withParaPropM md p = do - d <- md - withParaProp d p - -formattedString :: PandocMonad m => T.Text -> WS m [Element] +formattedString :: PandocMonad m => Text -> WS m [Element] formattedString str = -- properly handle soft hyphens case splitTextBy (=='\173') str of @@ -1171,7 +1025,7 @@ formattedString str = sh <- formattedRun [mknode "w:softHyphen" [] ()] intercalate sh <$> mapM formattedString' ws -formattedString' :: PandocMonad m => T.Text -> WS m [Element] +formattedString' :: PandocMonad m => Text -> WS m [Element] formattedString' str = do inDel <- asks envInDel formattedRun [ mktnode (if inDel then "w:delText" else "w:t") @@ -1182,16 +1036,13 @@ formattedRun els = do props <- getTextProps return [ mknode "w:r" [] $ props ++ els ] -setFirstPara :: PandocMonad m => WS m () -setFirstPara = modify $ \s -> s { stFirstPara = True } - -- | Convert an inline element to OpenXML. 
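-- A minimal standalone sketch (not part of this patch) of the soft-hyphen
-- handling in formattedString above: the text is split at U+00AD and the
-- pieces are rejoined with runs carrying <w:softHyphen/>. Plain strings stand
-- in for the xml-light elements; softHyphenRuns is an illustrative name, not
-- a pandoc function.
module SoftHyphenSketch where

import Data.List (intercalate)
import Data.Text (Text)
import qualified Data.Text as T

-- Render each fragment as a <w:t> run, separated by <w:softHyphen/> runs.
softHyphenRuns :: Text -> String
softHyphenRuns str =
  intercalate "<w:r><w:softHyphen/></w:r>"
    [ "<w:r><w:t xml:space=\"preserve\">" <> T.unpack piece <> "</w:t></w:r>"
    | piece <- T.split (== '\173') str ]

-- ghci> softHyphenRuns (T.pack "soft\173ware")
-- two <w:t> runs joined by a <w:softHyphen/> run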
-inlineToOpenXML :: PandocMonad m => WriterOptions -> Inline -> WS m [Element] +inlineToOpenXML :: PandocMonad m => WriterOptions -> Inline -> WS m [Content] inlineToOpenXML opts il = withDirection $ inlineToOpenXML' opts il -inlineToOpenXML' :: PandocMonad m => WriterOptions -> Inline -> WS m [Element] +inlineToOpenXML' :: PandocMonad m => WriterOptions -> Inline -> WS m [Content] inlineToOpenXML' _ (Str str) = - formattedString str + map Elem <$> formattedString str inlineToOpenXML' opts Space = inlineToOpenXML opts (Str " ") inlineToOpenXML' opts SoftBreak = inlineToOpenXML opts (Str " ") inlineToOpenXML' opts (Span ("",["csl-block"],[]) ils) = @@ -1199,10 +1050,11 @@ inlineToOpenXML' opts (Span ("",["csl-block"],[]) ils) = inlineToOpenXML' opts (Span ("",["csl-left-margin"],[]) ils) = inlinesToOpenXML opts ils inlineToOpenXML' opts (Span ("",["csl-right-inline"],[]) ils) = - ([mknode "w:r" [] - (mknode "w:t" - [("xml:space","preserve")] - ("\t" :: String))] ++) + ([Elem $ + mknode "w:r" [] + (mknode "w:t" + [("xml:space","preserve")] + ("\t" :: Text))] ++) <$> inlinesToOpenXML opts ils inlineToOpenXML' opts (Span ("",["csl-indent"],[]) ils) = inlinesToOpenXML opts ils @@ -1212,18 +1064,18 @@ inlineToOpenXML' _ (Span (ident,["comment-start"],kvs) ils) = do let ident' = fromMaybe ident (lookup "id" kvs) kvs' = filter (("id" /=) . fst) kvs modify $ \st -> st{ stComments = (("id",ident'):kvs', ils) : stComments st } - return [ mknode "w:commentRangeStart" [("w:id", T.unpack ident')] () ] + return [ Elem $ mknode "w:commentRangeStart" [("w:id", ident')] () ] inlineToOpenXML' _ (Span (ident,["comment-end"],kvs) _) = -- prefer the "id" in kvs, since that is the one produced by the docx -- reader. let ident' = fromMaybe ident (lookup "id" kvs) - in - return [ mknode "w:commentRangeEnd" [("w:id", T.unpack ident')] () - , mknode "w:r" [] - [ mknode "w:rPr" [] - [ mknode "w:rStyle" [("w:val", "CommentReference")] () ] - , mknode "w:commentReference" [("w:id", T.unpack ident')] () ] - ] + in return . map Elem $ + [ mknode "w:commentRangeEnd" [("w:id", ident')] () + , mknode "w:r" [] + [ mknode "w:rPr" [] + [ mknode "w:rStyle" [("w:val", "CommentReference")] () ] + , mknode "w:commentReference" [("w:id", ident')] () ] + ] inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do stylemod <- case lookup dynamicStyleKey kvs of Just (fromString . 
T.unpack -> sty) -> do @@ -1246,8 +1098,8 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do defaultAuthor <- asks envChangesAuthor let author = fromMaybe defaultAuthor (lookup "author" kvs) let mdate = lookup "date" kvs - return $ ("w:author", T.unpack author) : - maybe [] (\date -> [("w:date", T.unpack date)]) mdate + return $ ("w:author", author) : + maybe [] (\date -> [("w:date", date)]) mdate insmod <- if "insertion" `elem` classes then do changeAuthorDate <- getChangeAuthorDate @@ -1255,8 +1107,9 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do modify $ \s -> s{stInsId = insId + 1} return $ \f -> do x <- f - return [ mknode "w:ins" - (("w:id", show insId) : changeAuthorDate) x] + return [Elem $ + mknode "w:ins" + (("w:id", tshow insId) : changeAuthorDate) x] else return id delmod <- if "deletion" `elem` classes then do @@ -1265,16 +1118,20 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do modify $ \s -> s{stDelId = delId + 1} return $ \f -> local (\env->env{envInDel=True}) $ do x <- f - return [mknode "w:del" - (("w:id", show delId) : changeAuthorDate) x] + return [Elem $ mknode "w:del" + (("w:id", tshow delId) : changeAuthorDate) x] else return id contents <- insmod $ delmod $ dirmod $ stylemod $ pmod $ inlinesToOpenXML opts ils wrapBookmark ident contents inlineToOpenXML' opts (Strong lst) = - withTextProp (mknode "w:b" [] ()) $ inlinesToOpenXML opts lst + withTextProp (mknode "w:b" [] ()) $ + withTextProp (mknode "w:bCs" [] ()) $ -- needed for LTR, #6911 + inlinesToOpenXML opts lst inlineToOpenXML' opts (Emph lst) = - withTextProp (mknode "w:i" [] ()) $ inlinesToOpenXML opts lst + withTextProp (mknode "w:i" [] ()) $ + withTextProp (mknode "w:iCs" [] ()) $ -- needed for LTR, #6911 + inlinesToOpenXML opts lst inlineToOpenXML' opts (Underline lst) = withTextProp (mknode "w:u" [("w:val","single")] ()) $ inlinesToOpenXML opts lst @@ -1290,9 +1147,10 @@ inlineToOpenXML' opts (SmallCaps lst) = inlineToOpenXML' opts (Strikeout lst) = withTextProp (mknode "w:strike" [] ()) $ inlinesToOpenXML opts lst -inlineToOpenXML' _ LineBreak = return [br] +inlineToOpenXML' _ LineBreak = return [Elem br] inlineToOpenXML' _ il@(RawInline f str) - | f == Format "openxml" = return [ x | Elem x <- parseXML str ] + | f == Format "openxml" = return + [Text (CData CDataRaw str Nothing)] | otherwise = do report $ InlineNotRendered il return [] @@ -1305,26 +1163,26 @@ inlineToOpenXML' opts (Math mathType str) = do when (mathType == DisplayMath) setFirstPara res <- (lift . lift) (convertMath writeOMML mathType str) case res of - Right r -> return [r] + Right r -> return [Elem $ fromXLElement r] Left il -> inlineToOpenXML' opts il inlineToOpenXML' opts (Cite _ lst) = inlinesToOpenXML opts lst inlineToOpenXML' opts (Code attrs str) = do let alltoktypes = [KeywordTok ..] tokTypesMap <- mapM (\tt -> (,) tt <$> rStyleM (fromString $ show tt)) alltoktypes - let unhighlighted = intercalate [br] `fmap` + let unhighlighted = (map Elem . intercalate [br]) `fmap` mapM formattedString (T.lines str) formatOpenXML _fmtOpts = intercalate [br] . 
map (map toHlTok) toHlTok (toktype,tok) = mknode "w:r" [] [ mknode "w:rPr" [] $ maybeToList (lookup toktype tokTypesMap) - , mknode "w:t" [("xml:space","preserve")] (T.unpack tok) ] + , mknode "w:t" [("xml:space","preserve")] tok ] withTextPropM (rStyleM "Verbatim Char") $ if isNothing (writerHighlightStyle opts) then unhighlighted else case highlight (writerSyntaxMap opts) formatOpenXML attrs str of - Right h -> return h + Right h -> return (map Elem h) Left msg -> do unless (T.null msg) $ report $ CouldNotHighlight msg unhighlighted @@ -1335,7 +1193,7 @@ inlineToOpenXML' opts (Note bs) = do let notemarker = mknode "w:r" [] [ mknode "w:rPr" [] footnoteStyle , mknode "w:footnoteRef" [] () ] - let notemarkerXml = RawInline (Format "openxml") $ T.pack $ ppElement notemarker + let notemarkerXml = RawInline (Format "openxml") $ ppElement notemarker let insertNoteRef (Plain ils : xs) = Plain (notemarkerXml : Space : ils) : xs insertNoteRef (Para ils : xs) = Para (notemarkerXml : Space : ils) : xs insertNoteRef xs = Para [notemarkerXml] : xs @@ -1347,26 +1205,26 @@ inlineToOpenXML' opts (Note bs) = do $ insertNoteRef bs) let newnote = mknode "w:footnote" [("w:id", notenum)] contents modify $ \s -> s{ stFootnotes = newnote : notes } - return [ mknode "w:r" [] + return [ Elem $ mknode "w:r" [] [ mknode "w:rPr" [] footnoteStyle , mknode "w:footnoteReference" [("w:id", notenum)] () ] ] -- internal link: inlineToOpenXML' opts (Link _ txt (T.uncons -> Just ('#', xs),_)) = do contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt return - [ mknode "w:hyperlink" [("w:anchor", T.unpack $ toBookmarkName xs)] contents ] + [ Elem $ mknode "w:hyperlink" [("w:anchor", toBookmarkName xs)] contents ] -- external link: inlineToOpenXML' opts (Link _ txt (src,_)) = do contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt extlinks <- gets stExternalLinks - id' <- case M.lookup (T.unpack src) extlinks of + id' <- case M.lookup src extlinks of Just i -> return i Nothing -> do - i <- ("rId"++) `fmap` getUniqueId + i <- ("rId" <>) <$> getUniqueId modify $ \st -> st{ stExternalLinks = - M.insert (T.unpack src) i extlinks } + M.insert src i extlinks } return i - return [ mknode "w:hyperlink" [("r:id",id')] contents ] + return [ Elem $ mknode "w:hyperlink" [("r:id",id')] contents ] inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do pageWidth <- asks envPrintWidth imgs <- gets stImages @@ -1384,17 +1242,17 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do ,("noChangeAspect","1")] () nvPicPr = mknode "pic:nvPicPr" [] [ mknode "pic:cNvPr" - [("descr",T.unpack src),("id","0"),("name","Picture")] () + [("descr",src),("id","0"),("name","Picture")] () , cNvPicPr ] blipFill = mknode "pic:blipFill" [] - [ mknode "a:blip" [("r:embed",ident)] () + [ mknode "a:blip" [("r:embed",T.pack ident)] () , mknode "a:stretch" [] $ mknode "a:fillRect" [] () ] xfrm = mknode "a:xfrm" [] [ mknode "a:off" [("x","0"),("y","0")] () - , mknode "a:ext" [("cx",show xemu) - ,("cy",show yemu)] () ] + , mknode "a:ext" [("cx",tshow xemu) + ,("cy",tshow yemu)] () ] prstGeom = mknode "a:prstGeom" [("prst","rect")] $ mknode "a:avLst" [] () ln = mknode "a:ln" [("w","9525")] @@ -1415,12 +1273,12 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do imgElt = mknode "w:r" [] $ mknode "w:drawing" [] $ mknode "wp:inline" [] - [ mknode "wp:extent" [("cx",show xemu),("cy",show yemu)] () + [ mknode "wp:extent" [("cx",tshow xemu),("cy",tshow yemu)] () , 
mknode "wp:effectExtent" [("b","0"),("l","0"),("r","0"),("t","0")] () , mknode "wp:docPr" - [ ("descr", T.unpack $ stringify alt) - , ("title", T.unpack title) + [ ("descr", stringify alt) + , ("title", title) , ("id","1") , ("name","Picture") ] () @@ -1430,10 +1288,10 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do imgElt wrapBookmark imgident =<< case stImage of - Just imgData -> return [generateImgElt imgData] + Just imgData -> return [Elem $ generateImgElt imgData] Nothing -> ( do --try (img, mt) <- P.fetchItem src - ident <- ("rId"++) `fmap` getUniqueId + ident <- ("rId" <>) <$> getUniqueId let imgext = case mt >>= extensionFromMimeType of @@ -1446,11 +1304,12 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do Just Eps -> ".eps" Just Svg -> ".svg" Just Emf -> ".emf" + Just Tiff -> ".tiff" Nothing -> "" - imgpath = "media/" <> ident <> T.unpack imgext - mbMimeType = mt <|> getMimeType imgpath + imgpath = "media/" <> ident <> imgext + mbMimeType = mt <|> getMimeType (T.unpack imgpath) - imgData = (ident, imgpath, mbMimeType, img) + imgData = (T.unpack ident, T.unpack imgpath, mbMimeType, img) if T.null imgext then -- without an extension there is no rule for content type @@ -1458,7 +1317,7 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do else do -- insert mime type to use in constructing [Content_Types].xml modify $ \st -> st { stImages = M.insert (T.unpack src) imgData $ stImages st } - return [generateImgElt imgData] + return [Elem $ generateImgElt imgData] ) `catchError` ( \e -> do report $ CouldNotFetchResource src $ T.pack (show e) @@ -1469,22 +1328,6 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do br :: Element br = mknode "w:r" [] [mknode "w:br" [] ()] --- Word will insert these footnotes into the settings.xml file --- (whether or not they're visible in the document). If they're in the --- file, but not in the footnotes.xml file, it will produce --- problems. So we want to make sure we insert them into our document. -defaultFootnotes :: [Element] -defaultFootnotes = [ mknode "w:footnote" - [("w:type", "separator"), ("w:id", "-1")] - [ mknode "w:p" [] - [mknode "w:r" [] - [ mknode "w:separator" [] ()]]] - , mknode "w:footnote" - [("w:type", "continuationSeparator"), ("w:id", "0")] - [ mknode "w:p" [] - [ mknode "w:r" [] - [ mknode "w:continuationSeparator" [] ()]]]] - withDirection :: PandocMonad m => WS m a -> WS m a withDirection x = do @@ -1508,20 +1351,20 @@ withDirection x = do , envTextProperties = EnvProps textStyle textProps' } -wrapBookmark :: (PandocMonad m) => T.Text -> [Element] -> WS m [Element] +wrapBookmark :: (PandocMonad m) => Text -> [Content] -> WS m [Content] wrapBookmark "" contents = return contents wrapBookmark ident contents = do id' <- getUniqueId let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') - ,("w:name", T.unpack $ toBookmarkName ident)] () + ,("w:name", toBookmarkName ident)] () bookmarkEnd = mknode "w:bookmarkEnd" [("w:id", id')] () - return $ bookmarkStart : contents ++ [bookmarkEnd] + return $ Elem bookmarkStart : contents ++ [Elem bookmarkEnd] -- Word imposes a 40 character limit on bookmark names and requires -- that they begin with a letter. So we just use a hash of the -- identifier when otherwise we'd have an illegal bookmark name. 
-toBookmarkName :: T.Text -> T.Text +toBookmarkName :: Text -> Text toBookmarkName s | Just (c, _) <- T.uncons s , isLetter c diff --git a/src/Text/Pandoc/Writers/Docx/StyleMap.hs b/src/Text/Pandoc/Writers/Docx/StyleMap.hs index c3c54c7e5..04868eaad 100644 --- a/src/Text/Pandoc/Writers/Docx/StyleMap.hs +++ b/src/Text/Pandoc/Writers/Docx/StyleMap.hs @@ -2,7 +2,7 @@ {- | Module : Text.Pandoc.Writers.Docx.StyleMap Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, - 2014-2020 John MacFarlane <jgm@berkeley.edu>, + 2014-2021 John MacFarlane <jgm@berkeley.edu>, 2015-2019 Nikolay Yakimov <root@livid.pp.ru> License : GNU GPL, version 2 or above diff --git a/src/Text/Pandoc/Writers/Docx/Table.hs b/src/Text/Pandoc/Writers/Docx/Table.hs new file mode 100644 index 000000000..7a84c5278 --- /dev/null +++ b/src/Text/Pandoc/Writers/Docx/Table.hs @@ -0,0 +1,227 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{- | +Module : Text.Pandoc.Writers.Docx.Table +Copyright : Copyright (C) 2012-2021 John MacFarlane +License : GNU GPL, version 2 or above +Maintainer : John MacFarlane <jgm@berkeley.edu> + +Conversion of table blocks to docx. +-} +module Text.Pandoc.Writers.Docx.Table + ( tableToOpenXML + ) where + +import Control.Monad.State.Strict +import Data.Array +import Data.Text (Text) +import Text.Pandoc.Definition +import Text.Pandoc.Class.PandocMonad (PandocMonad, translateTerm) +import Text.Pandoc.Writers.Docx.Types +import Text.Pandoc.Shared +import Text.Printf (printf) +import Text.Pandoc.Writers.GridTable hiding (Table) +import Text.Pandoc.Writers.OOXML +import Text.Pandoc.XML.Light as XML hiding (Attr) +import qualified Data.Text as T +import qualified Text.Pandoc.Translations as Term +import qualified Text.Pandoc.Writers.GridTable as Grid + +tableToOpenXML :: PandocMonad m + => ([Block] -> WS m [Content]) + -> Grid.Table + -> WS m [Content] +tableToOpenXML blocksToOpenXML gridTable = do + setFirstPara + let (Grid.Table (ident,_,_) caption colspecs _rowheads thead tbodies tfoot) = + gridTable + let (Caption _maybeShortCaption captionBlocks) = caption + tablenum <- gets stNextTableNum + unless (null captionBlocks) $ + modify $ \st -> st{ stNextTableNum = tablenum + 1 } + let tableid = if T.null ident + then "table" <> tshow tablenum + else ident + tablename <- translateTerm Term.Table + let captionStr = stringify captionBlocks + let aligns = map fst $ elems colspecs + captionXml <- if null captionBlocks + then return [] + else withParaPropM (pStyleM "Table Caption") + $ blocksToOpenXML + $ addLabel tableid tablename tablenum captionBlocks + -- We set "in table" after processing the caption, because we don't + -- want the "Table Caption" style to be overwritten with "Compact". + modify $ \s -> s { stInTable = True } + head' <- cellGridToOpenXML blocksToOpenXML HeadRow aligns thead + bodies <- mapM (cellGridToOpenXML blocksToOpenXML BodyRow aligns) tbodies + foot' <- cellGridToOpenXML blocksToOpenXML FootRow aligns tfoot + + let hasHeader = not . null . indices . partRowAttrs $ thead + let hasFooter = not . null . indices . 
partRowAttrs $ tfoot
+  -- for compatibility with Word <= 2007, we include a val with a bitmask
+  -- 0x0020 Apply first row conditional formatting
+  -- 0x0040 Apply last row conditional formatting
+  -- 0x0080 Apply first column conditional formatting
+  -- 0x0100 Apply last column conditional formatting
+  -- 0x0200 Do not apply row banding conditional formatting
+  -- 0x0400 Do not apply column banding conditional formatting
+  let tblLookVal = if hasHeader then (0x20 :: Int) else 0
+  let (gridCols, tblWattr) = tableLayout (elems colspecs)
+  let tbl = mknode "w:tbl" []
+        ( mknode "w:tblPr" []
+          ( mknode "w:tblStyle" [("w:val","Table")] () :
+            mknode "w:tblW" tblWattr () :
+            mknode "w:tblLook" [("w:firstRow",if hasHeader then "1" else "0")
+                               ,("w:lastRow",if hasFooter then "1" else "0")
+                               ,("w:firstColumn","0")
+                               ,("w:lastColumn","0")
+                               ,("w:noHBand","0")
+                               ,("w:noVBand","0")
+                               ,("w:val", T.pack $ printf "%04x" tblLookVal)
+                               ] () :
+            [ mknode "w:tblCaption" [("w:val", captionStr)] ()
+            | not (T.null captionStr) ]
+          )
+          : mknode "w:tblGrid" [] gridCols
+          : head' ++ mconcat bodies ++ foot'
+        )
+  modify $ \s -> s { stInTable = False }
+  return $ captionXml ++ [Elem tbl]
+
+addLabel :: Text -> Text -> Int -> [Block] -> [Block]
+addLabel tableid tablename tablenum bs =
+  case bs of
+    (Para ils : rest)  -> Para (label : Space : ils) : rest
+    (Plain ils : rest) -> Plain (label : Space : ils) : rest
+    _                  -> Para [label] : bs
+  where
+    label = Span (tableid,[],[])
+            [Str (tablename <> "\160"),
+             RawInline (Format "openxml")
+               ("<w:fldSimple w:instr=\"SEQ Table"
+                <> " \\* ARABIC \"><w:r><w:t>"
+                <> tshow tablenum
+                <> "</w:t></w:r></w:fldSimple>"),
+             Str ":"]
+
+-- | Parts of a table
+data RowType = HeadRow | BodyRow | FootRow
+
+alignmentToString :: Alignment -> Text
+alignmentToString = \case
+  AlignLeft -> "left"
+  AlignRight -> "right"
+  AlignCenter -> "center"
+  AlignDefault -> "left"
+
+tableLayout :: [ColSpec] -> ([Element], [(Text, Text)])
+tableLayout specs =
+  let
+    textwidth = 7920 -- 5.5 in in twips (1 twip == 1/20 pt)
+    fullrow = 5000 -- 100% specified in pct (1 pct == 1/50th of a percent)
+    ncols = length specs
+    getWidth = \case
+      ColWidth n -> n
+      _ -> 0
+    widths = map (getWidth . snd) specs
+    rowwidth = round (fullrow * sum widths) :: Int
+    widthToTwips w = floor (textwidth * w) :: Int
+    mkGridCol w = mknode "w:gridCol" [("w:w", tshow (widthToTwips w))] ()
+  in if all (== 0) widths
+     then ( replicate ncols $ mkGridCol (1.0 / fromIntegral ncols)
+          , [ ("w:type", "auto"), ("w:w", "0")])
+     else ( map mkGridCol widths
+          , [ ("w:type", "pct"), ("w:w", tshow rowwidth) ])
+
+cellGridToOpenXML :: PandocMonad m
+                  => ([Block] -> WS m [Content])
+                  -> RowType
+                  -> [Alignment]
+                  -> Part
+                  -> WS m [Element]
+cellGridToOpenXML blocksToOpenXML rowType aligns part@(Part _ cellArray _) =
+  if null (elems cellArray)
+  then return mempty
+  else mapM (rowToOpenXML blocksToOpenXML) $
+       partToRows rowType aligns part
+
+data OOXMLCell
+  = OOXMLCell Attr Alignment RowSpan ColSpan [Block]
+  | OOXMLCellMerge ColSpan
+
+data OOXMLRow = OOXMLRow RowType Attr [OOXMLCell]
+
+partToRows :: RowType -> [Alignment] -> Part -> [OOXMLRow]
+partToRows rowType aligns part =
+  let
+    toOOXMLCell :: Alignment -> RowIndex -> ColIndex -> GridCell -> [OOXMLCell]
+    toOOXMLCell columnAlign ridx cidx = \case
+      ContentCell attr align rowspan colspan blocks ->
+        -- Respect non-default, cell specific alignment.
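-- A minimal standalone sketch (not part of this patch) of the width
-- arithmetic in tableLayout above: column widths are fractions of a 5.5 inch
-- text width in twips (1 in = 72 pt = 1440 twips, so 5.5 in = 7920 twips),
-- the table width is given in fiftieths of a percent (5000 = 100%), and
-- <w:tblLook> gets the 0x0020 "first row" bit when the table has a header.
-- Names here are illustrative, not pandoc's.
module TableWidthSketch where

import Text.Printf (printf)

textwidthTwips :: Double
textwidthTwips = 7920          -- 5.5 in * 1440 twips per inch

fullRowPct :: Double
fullRowPct = 5000              -- 100% in units of 1/50 of a percent

-- <w:gridCol w:w="..."/> value for one relative column width.
colTwips :: Double -> Int
colTwips w = floor (textwidthTwips * w)

-- <w:tblW w:type="pct" w:w="..."/> value for the whole table.
tableWidthPct :: [Double] -> Int
tableWidthPct widths = round (fullRowPct * sum widths)

-- <w:tblLook w:val="..."/> rendered as four hex digits.
tblLookValue :: Bool -> String
tblLookValue hasHeader = printf "%04x" (if hasHeader then 0x20 else 0 :: Int)

-- ghci> map colTwips [0.25, 0.75]   -- [1980,5940]
-- ghci> tableWidthPct [0.25, 0.75]  -- 5000
-- ghci> tblLookValue True           -- "0020"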
+ let align' = case align of + AlignDefault -> columnAlign + _ -> align + in [OOXMLCell attr align' rowspan colspan blocks] + ContinuationCell idx'@(ridx',cidx') | ridx /= ridx', cidx == cidx' -> + case (partCellArray part)!idx' of + (ContentCell _ _ _ colspan _) -> [OOXMLCellMerge colspan] + x -> error $ "Content cell expected, got, " ++ show x ++ + " at index " ++ show idx' + _ -> mempty + mkRow :: (RowIndex, Attr) -> OOXMLRow + mkRow (ridx, attr) = OOXMLRow rowType attr + . mconcat + . zipWith (\align -> uncurry $ toOOXMLCell align ridx) + aligns + . assocs + . rowArray ridx + $ partCellArray part + in map mkRow $ assocs (partRowAttrs part) + +rowToOpenXML :: PandocMonad m + => ([Block] -> WS m [Content]) + -> OOXMLRow + -> WS m Element +rowToOpenXML blocksToOpenXML (OOXMLRow rowType _attr cells) = do + xmlcells <- mapM (ooxmlCellToOpenXML blocksToOpenXML) cells + let addTrPr = case rowType of + HeadRow -> (mknode "w:trPr" [] + [mknode "w:tblHeader" [("w:val", "true")] ()] :) + BodyRow -> id + FootRow -> id + return $ mknode "w:tr" [] (addTrPr xmlcells) + +ooxmlCellToOpenXML :: PandocMonad m + => ([Block] -> WS m [Content]) + -> OOXMLCell + -> WS m Element +ooxmlCellToOpenXML blocksToOpenXML = \case + OOXMLCellMerge (ColSpan colspan) -> do + return $ mknode "w:tc" [] + [ mknode "w:tcPr" [] [ mknode "w:gridSpan" [("w:val", tshow colspan)] () + , mknode "w:vMerge" [("w:val", "continue")] () ] + , mknode "w:p" [] [mknode "w:pPr" [] ()]] + OOXMLCell _attr align rowspan (ColSpan colspan) contents -> do + compactStyle <- pStyleM "Compact" + es <- withParaProp (alignmentFor align) $ blocksToOpenXML contents + -- Table cells require a <w:p> element, even an empty one! + -- Not in the spec but in Word 2007, 2010. See #4953. And + -- apparently the last element must be a <w:p>, see #6983. + return . mknode "w:tc" [] $ + Elem + (mknode "w:tcPr" [] ([ mknode "w:gridSpan" [("w:val", tshow colspan)] () + | colspan > 1] ++ + [ mknode "w:vMerge" [("w:val", "restart")] () + | rowspan > RowSpan 1 ])) : + if null contents + then [Elem $ mknode "w:p" [] [mknode "w:pPr" [] [compactStyle]]] + else case reverse (onlyElems es) of + b:e:_ | qName (elName b) == "bookmarkEnd" -- y tho? + , qName (elName e) == "p" -> es + e:_ | qName (elName e) == "p" -> es + _ -> es ++ [Elem $ mknode "w:p" [] ()] + +alignmentFor :: Alignment -> Element +alignmentFor al = mknode "w:jc" [("w:val",alignmentToString al)] () diff --git a/src/Text/Pandoc/Writers/Docx/Types.hs b/src/Text/Pandoc/Writers/Docx/Types.hs new file mode 100644 index 000000000..74b8d2753 --- /dev/null +++ b/src/Text/Pandoc/Writers/Docx/Types.hs @@ -0,0 +1,185 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | +Module : Text.Pandoc.Writers.Docx +Copyright : Copyright (C) 2012-2021 John MacFarlane +License : GNU GPL, version 2 or above +Maintainer : John MacFarlane <jgm@berkeley.edu> + +Conversion of table blocks to docx. +-} +module Text.Pandoc.Writers.Docx.Types + ( EnvProps (..) + , WriterEnv (..) + , defaultWriterEnv + , WriterState (..) + , defaultWriterState + , WS + , ListMarker (..) 
+ , listMarkerToId + , pStyleM + , isStyle + , setFirstPara + , withParaProp + , withParaPropM + ) where + +import Control.Applicative ((<|>)) +import Control.Monad.Reader +import Control.Monad.State.Strict +import Data.Text (Text) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Text.Pandoc.MIME (MimeType) +import Text.Pandoc.Writers.Docx.StyleMap +import Text.Pandoc.Writers.OOXML +import Text.Pandoc.XML.Light as XML +import qualified Data.ByteString as B +import qualified Data.Map as M +import qualified Data.Set as Set +import qualified Data.Text as T + +data ListMarker = NoMarker + | BulletMarker + | NumberMarker ListNumberStyle ListNumberDelim Int + deriving (Show, Read, Eq, Ord) + +listMarkerToId :: ListMarker -> Text +listMarkerToId NoMarker = "990" +listMarkerToId BulletMarker = "991" +listMarkerToId (NumberMarker sty delim n) = T.pack $ + '9' : '9' : styNum : delimNum : show n + where styNum = case sty of + DefaultStyle -> '2' + Example -> '3' + Decimal -> '4' + LowerRoman -> '5' + UpperRoman -> '6' + LowerAlpha -> '7' + UpperAlpha -> '8' + delimNum = case delim of + DefaultDelim -> '0' + Period -> '1' + OneParen -> '2' + TwoParens -> '3' + + +data EnvProps = EnvProps{ styleElement :: Maybe Element + , otherElements :: [Element] + } + +instance Semigroup EnvProps where + EnvProps s es <> EnvProps s' es' = EnvProps (s <|> s') (es ++ es') + +instance Monoid EnvProps where + mempty = EnvProps Nothing [] + mappend = (<>) + +data WriterEnv = WriterEnv + { envTextProperties :: EnvProps + , envParaProperties :: EnvProps + , envRTL :: Bool + , envListLevel :: Int + , envListNumId :: Int + , envInDel :: Bool + , envChangesAuthor :: Text + , envChangesDate :: Text + , envPrintWidth :: Integer + } + +defaultWriterEnv :: WriterEnv +defaultWriterEnv = WriterEnv + { envTextProperties = mempty + , envParaProperties = mempty + , envRTL = False + , envListLevel = -1 + , envListNumId = 1 + , envInDel = False + , envChangesAuthor = "unknown" + , envChangesDate = "1969-12-31T19:00:00Z" + , envPrintWidth = 1 + } + + +data WriterState = WriterState{ + stFootnotes :: [Element] + , stComments :: [([(Text, Text)], [Inline])] + , stSectionIds :: Set.Set Text + , stExternalLinks :: M.Map Text Text + , stImages :: M.Map FilePath (String, String, Maybe MimeType, B.ByteString) + , stLists :: [ListMarker] + , stInsId :: Int + , stDelId :: Int + , stStyleMaps :: StyleMaps + , stFirstPara :: Bool + , stInTable :: Bool + , stInList :: Bool + , stTocTitle :: [Inline] + , stDynamicParaProps :: Set.Set ParaStyleName + , stDynamicTextProps :: Set.Set CharStyleName + , stCurId :: Int + , stNextFigureNum :: Int + , stNextTableNum :: Int + } + +defaultWriterState :: WriterState +defaultWriterState = WriterState{ + stFootnotes = defaultFootnotes + , stComments = [] + , stSectionIds = Set.empty + , stExternalLinks = M.empty + , stImages = M.empty + , stLists = [NoMarker] + , stInsId = 1 + , stDelId = 1 + , stStyleMaps = StyleMaps M.empty M.empty + , stFirstPara = False + , stInTable = False + , stInList = False + , stTocTitle = [Str "Table of Contents"] + , stDynamicParaProps = Set.empty + , stDynamicTextProps = Set.empty + , stCurId = 20 + , stNextFigureNum = 1 + , stNextTableNum = 1 + } + +setFirstPara :: PandocMonad m => WS m () +setFirstPara = modify $ \s -> s { stFirstPara = True } + +type WS m = ReaderT WriterEnv (StateT WriterState m) + +-- Word will insert these footnotes into the settings.xml file +-- (whether or not they're visible in the document). 
If they're in the +-- file, but not in the footnotes.xml file, it will produce +-- problems. So we want to make sure we insert them into our document. +defaultFootnotes :: [Element] +defaultFootnotes = [ mknode "w:footnote" + [("w:type", "separator"), ("w:id", "-1")] + [ mknode "w:p" [] + [mknode "w:r" [] + [ mknode "w:separator" [] ()]]] + , mknode "w:footnote" + [("w:type", "continuationSeparator"), ("w:id", "0")] + [ mknode "w:p" [] + [ mknode "w:r" [] + [ mknode "w:continuationSeparator" [] ()]]]] + +pStyleM :: (PandocMonad m) => ParaStyleName -> WS m XML.Element +pStyleM styleName = do + pStyleMap <- gets (smParaStyle . stStyleMaps) + let sty' = getStyleIdFromName styleName pStyleMap + return $ mknode "w:pStyle" [("w:val", fromStyleId sty')] () + +withParaProp :: PandocMonad m => Element -> WS m a -> WS m a +withParaProp d p = + local (\env -> env {envParaProperties = ep <> envParaProperties env}) p + where ep = if isStyle d then EnvProps (Just d) [] else EnvProps Nothing [d] + +withParaPropM :: PandocMonad m => WS m Element -> WS m a -> WS m a +withParaPropM md p = do + d <- md + withParaProp d p + +isStyle :: Element -> Bool +isStyle e = isElem [] "w" "rStyle" e || + isElem [] "w" "pStyle" e diff --git a/src/Text/Pandoc/Writers/DokuWiki.hs b/src/Text/Pandoc/Writers/DokuWiki.hs index 90ec6824f..602c70ebe 100644 --- a/src/Text/Pandoc/Writers/DokuWiki.hs +++ b/src/Text/Pandoc/Writers/DokuWiki.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.DokuWiki - Copyright : Copyright (C) 2008-2020 John MacFarlane + Copyright : Copyright (C) 2008-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : Clare Macrae <clare.macrae@googlemail.com> @@ -27,6 +27,7 @@ import Control.Monad.Reader (ReaderT, asks, local, runReaderT) import Control.Monad.State.Strict (StateT, evalStateT) import Data.Default (Default (..)) import Data.List (transpose) +import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Class.PandocMonad (PandocMonad, report) @@ -172,7 +173,8 @@ blockToDokuWiki opts (Table _ blkCapt specs thead tbody tfoot) = do then return [] else zipWithM (tableItemToDokuWiki opts) aligns headers rows' <- mapM (zipWithM (tableItemToDokuWiki opts) aligns) rows - let widths = map (maximum . map T.length) $ transpose (headers':rows') + let widths = map (maybe 0 maximum . nonEmpty . 
map T.length) + $ transpose (headers':rows') let padTo (width, al) s = case width - T.length s of x | x > 0 -> diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 12004889f..508fb6a98 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -5,7 +5,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Writers.EPUB - Copyright : Copyright (C) 2010-2020 John MacFarlane + Copyright : Copyright (C) 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -24,12 +24,13 @@ import Control.Monad.State.Strict (StateT, evalState, evalStateT, get, gets, lift, modify) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as B8 -import Data.Char (isAlphaNum, isAscii, isDigit, toLower) +import Data.Char (isAlphaNum, isAscii, isDigit) import Data.List (isInfixOf, isPrefixOf) import qualified Data.Map as M import Data.Maybe (fromMaybe, isNothing, mapMaybe, isJust) import qualified Data.Set as Set -import qualified Data.Text as TS +import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Text.Lazy as TL import Network.HTTP (urlEncode) import System.FilePath (takeExtension, takeFileName, makeRelative) @@ -48,15 +49,13 @@ import Text.Pandoc.Options (EPUBVersion (..), HTMLMathMethod (..), ObfuscationMethod (NoObfuscation), WrapOption (..), WriterOptions (..)) import Text.Pandoc.Shared (makeSections, normalizeDate, renderTags', - safeRead, stringify, trim, uniqueIdent, tshow) + stringify, uniqueIdent, tshow) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.UUID (getRandomUUID) import Text.Pandoc.Walk (query, walk, walkM) import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB) import Text.Printf (printf) -import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..), - add_attrs, lookupAttr, node, onlyElems, parseXML, - ppElement, showElement, strContent, unode, unqual) +import Text.Pandoc.XML.Light import Text.Pandoc.XML (escapeStringForXML) import Text.DocTemplates (FromContext(lookupContext), Context(..), ToContext(toVal), Val(..)) @@ -68,69 +67,72 @@ newtype Chapter = Chapter [Block] data EPUBState = EPUBState { stMediaPaths :: [(FilePath, (FilePath, Maybe Entry))] , stMediaNextId :: Int - , stEpubSubdir :: String + , stEpubSubdir :: FilePath } type E m = StateT EPUBState m data EPUBMetadata = EPUBMetadata{ - epubIdentifier :: [Identifier] - , epubTitle :: [Title] - , epubDate :: [Date] - , epubLanguage :: String - , epubCreator :: [Creator] - , epubContributor :: [Creator] - , epubSubject :: [String] - , epubDescription :: Maybe String - , epubType :: Maybe String - , epubFormat :: Maybe String - , epubPublisher :: Maybe String - , epubSource :: Maybe String - , epubRelation :: Maybe String - , epubCoverage :: Maybe String - , epubRights :: Maybe String - , epubCoverImage :: Maybe String - , epubStylesheets :: [FilePath] - , epubPageDirection :: Maybe ProgressionDirection - , epubIbooksFields :: [(String, String)] - , epubCalibreFields :: [(String, String)] + epubIdentifier :: [Identifier] + , epubTitle :: [Title] + , epubDate :: [Date] + , epubLanguage :: Text + , epubCreator :: [Creator] + , epubContributor :: [Creator] + , epubSubject :: [Text] + , epubDescription :: Maybe Text + , epubType :: Maybe Text + , epubFormat :: Maybe Text + , epubPublisher :: Maybe Text + , epubSource :: Maybe Text + , epubRelation :: Maybe Text + , epubCoverage :: Maybe Text + , epubRights :: Maybe Text + , 
epubBelongsToCollection :: Maybe Text + , epubGroupPosition :: Maybe Text + , epubCoverImage :: Maybe FilePath + , epubStylesheets :: [FilePath] + , epubPageDirection :: Maybe ProgressionDirection + , epubIbooksFields :: [(Text, Text)] + , epubCalibreFields :: [(Text, Text)] } deriving Show data Date = Date{ - dateText :: String - , dateEvent :: Maybe String + dateText :: Text + , dateEvent :: Maybe Text } deriving Show data Creator = Creator{ - creatorText :: String - , creatorRole :: Maybe String - , creatorFileAs :: Maybe String + creatorText :: Text + , creatorRole :: Maybe Text + , creatorFileAs :: Maybe Text } deriving Show data Identifier = Identifier{ - identifierText :: String - , identifierScheme :: Maybe String + identifierText :: Text + , identifierScheme :: Maybe Text } deriving Show data Title = Title{ - titleText :: String - , titleFileAs :: Maybe String - , titleType :: Maybe String + titleText :: Text + , titleFileAs :: Maybe Text + , titleType :: Maybe Text } deriving Show data ProgressionDirection = LTR | RTL deriving Show -dcName :: String -> QName +dcName :: Text -> QName dcName n = QName n Nothing (Just "dc") -dcNode :: Node t => String -> t -> Element +dcNode :: Node t => Text -> t -> Element dcNode = node . dcName -opfName :: String -> QName +opfName :: Text -> QName opfName n = QName n Nothing (Just "opf") -toId :: FilePath -> String -toId = map (\x -> if isAlphaNum x || x == '-' || x == '_' +toId :: FilePath -> Text +toId = T.pack . + map (\x -> if isAlphaNum x || x == '-' || x == '_' then x else '_') . takeFileName @@ -138,8 +140,8 @@ removeNote :: Inline -> Inline removeNote (Note _) = Str "" removeNote x = x -toVal' :: String -> Val TS.Text -toVal' = toVal . TS.pack +toVal' :: Text -> Val T.Text +toVal' = toVal mkEntry :: PandocMonad m => FilePath -> B.ByteString -> E m Entry mkEntry path content = do @@ -158,32 +160,37 @@ mkEntry path content = do getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata getEPUBMetadata opts meta = do let md = metadataFromMeta opts meta - let elts = maybe [] (onlyElems . 
parseXML) $ writerEpubMetadata opts + elts <- case writerEpubMetadata opts of + Nothing -> return [] + Just t -> case parseXMLContents (TL.fromStrict t) of + Left msg -> throwError $ + PandocXMLError "epub metadata" msg + Right ns -> return (onlyElems ns) let md' = foldr addMetadataFromXML md elts let addIdentifier m = if null (epubIdentifier m) then do randomId <- getRandomUUID - return $ m{ epubIdentifier = [Identifier (show randomId) Nothing] } + return $ m{ epubIdentifier = [Identifier (tshow randomId) Nothing] } else return m let addLanguage m = - if null (epubLanguage m) + if T.null (epubLanguage m) then case lookupContext "lang" (writerVariables opts) of - Just x -> return m{ epubLanguage = TS.unpack x } + Just x -> return m{ epubLanguage = x } Nothing -> do mLang <- lift $ P.lookupEnv "LANG" let localeLang = case mLang of Just lang -> - TS.map (\c -> if c == '_' then '-' else c) $ - TS.takeWhile (/='.') lang + T.map (\c -> if c == '_' then '-' else c) $ + T.takeWhile (/='.') lang Nothing -> "en-US" - return m{ epubLanguage = TS.unpack localeLang } + return m{ epubLanguage = localeLang } else return m let fixDate m = if null (epubDate m) then do - currentTime <- lift P.getCurrentTime + currentTime <- lift P.getTimestamp return $ m{ epubDate = [ Date{ dateText = showDateTimeISO8601 currentTime , dateEvent = Nothing } ] } @@ -193,7 +200,7 @@ getEPUBMetadata opts meta = do then return m else do let authors' = map stringify $ docAuthors meta - let toAuthor name = Creator{ creatorText = TS.unpack name + let toAuthor name = Creator{ creatorText = name , creatorRole = Just "aut" , creatorFileAs = Nothing } return $ m{ epubCreator = map toAuthor authors' ++ epubCreator m } @@ -235,35 +242,37 @@ addMetadataFromXML e@(Element (QName name _ (Just "dc")) attrs _ _) md | name == "relation" = md { epubRelation = Just $ strContent e } | name == "coverage" = md { epubCoverage = Just $ strContent e } | name == "rights" = md { epubRights = Just $ strContent e } + | name == "belongs-to-collection" = md { epubBelongsToCollection = Just $ strContent e } + | name == "group-position" = md { epubGroupPosition = Just $ strContent e } | otherwise = md where getAttr n = lookupAttr (opfName n) attrs addMetadataFromXML e@(Element (QName "meta" _ _) attrs _ _) md = case getAttr "property" of - Just s | "ibooks:" `isPrefixOf` s -> - md{ epubIbooksFields = (drop 7 s, strContent e) : + Just s | "ibooks:" `T.isPrefixOf` s -> + md{ epubIbooksFields = (T.drop 7 s, strContent e) : epubIbooksFields md } _ -> case getAttr "name" of - Just s | "calibre:" `isPrefixOf` s -> + Just s | "calibre:" `T.isPrefixOf` s -> md{ epubCalibreFields = - (drop 8 s, fromMaybe "" $ getAttr "content") : + (T.drop 8 s, fromMaybe "" $ getAttr "content") : epubCalibreFields md } _ -> md where getAttr n = lookupAttr (unqual n) attrs addMetadataFromXML _ md = md -metaValueToString :: MetaValue -> String -metaValueToString (MetaString s) = TS.unpack s -metaValueToString (MetaInlines ils) = TS.unpack $ stringify ils -metaValueToString (MetaBlocks bs) = TS.unpack $ stringify bs +metaValueToString :: MetaValue -> Text +metaValueToString (MetaString s) = s +metaValueToString (MetaInlines ils) = stringify ils +metaValueToString (MetaBlocks bs) = stringify bs metaValueToString (MetaBool True) = "true" metaValueToString (MetaBool False) = "false" metaValueToString _ = "" metaValueToPaths :: MetaValue -> [FilePath] -metaValueToPaths (MetaList xs) = map metaValueToString xs -metaValueToPaths x = [metaValueToString x] +metaValueToPaths (MetaList xs) = map 
(T.unpack . metaValueToString) xs +metaValueToPaths x = [T.unpack $ metaValueToString x] -getList :: TS.Text -> Meta -> (MetaValue -> a) -> [a] +getList :: T.Text -> Meta -> (MetaValue -> a) -> [a] getList s meta handleMetaValue = case lookupMeta s meta of Just (MetaList xs) -> map handleMetaValue xs @@ -287,7 +296,7 @@ getTitle meta = getList "title" meta handleMetaValue , titleType = metaValueToString <$> M.lookup "type" m } handleMetaValue mv = Title (metaValueToString mv) Nothing Nothing -getCreator :: TS.Text -> Meta -> [Creator] +getCreator :: T.Text -> Meta -> [Creator] getCreator s meta = getList s meta handleMetaValue where handleMetaValue (MetaMap m) = Creator{ creatorText = maybe "" metaValueToString $ M.lookup "text" m @@ -295,7 +304,7 @@ getCreator s meta = getList s meta handleMetaValue , creatorRole = metaValueToString <$> M.lookup "role" m } handleMetaValue mv = Creator (metaValueToString mv) Nothing Nothing -getDate :: TS.Text -> Meta -> [Date] +getDate :: T.Text -> Meta -> [Date] getDate s meta = getList s meta handleMetaValue where handleMetaValue (MetaMap m) = Date{ dateText = fromMaybe "" $ @@ -304,7 +313,7 @@ getDate s meta = getList s meta handleMetaValue handleMetaValue mv = Date { dateText = fromMaybe "" $ normalizeDate' $ metaValueToString mv , dateEvent = Nothing } -simpleList :: TS.Text -> Meta -> [String] +simpleList :: T.Text -> Meta -> [Text] simpleList s meta = case lookupMeta s meta of Just (MetaList xs) -> map metaValueToString xs @@ -313,26 +322,28 @@ simpleList s meta = metadataFromMeta :: WriterOptions -> Meta -> EPUBMetadata metadataFromMeta opts meta = EPUBMetadata{ - epubIdentifier = identifiers - , epubTitle = titles - , epubDate = date - , epubLanguage = language - , epubCreator = creators - , epubContributor = contributors - , epubSubject = subjects - , epubDescription = description - , epubType = epubtype - , epubFormat = format - , epubPublisher = publisher - , epubSource = source - , epubRelation = relation - , epubCoverage = coverage - , epubRights = rights - , epubCoverImage = coverImage - , epubStylesheets = stylesheets - , epubPageDirection = pageDirection - , epubIbooksFields = ibooksFields - , epubCalibreFields = calibreFields + epubIdentifier = identifiers + , epubTitle = titles + , epubDate = date + , epubLanguage = language + , epubCreator = creators + , epubContributor = contributors + , epubSubject = subjects + , epubDescription = description + , epubType = epubtype + , epubFormat = format + , epubPublisher = publisher + , epubSource = source + , epubRelation = relation + , epubCoverage = coverage + , epubRights = rights + , epubBelongsToCollection = belongsToCollection + , epubGroupPosition = groupPosition + , epubCoverImage = coverImage + , epubStylesheets = stylesheets + , epubPageDirection = pageDirection + , epubIbooksFields = ibooksFields + , epubCalibreFields = calibreFields } where identifiers = getIdentifier meta titles = getTitle meta @@ -350,30 +361,31 @@ metadataFromMeta opts meta = EPUBMetadata{ relation = metaValueToString <$> lookupMeta "relation" meta coverage = metaValueToString <$> lookupMeta "coverage" meta rights = metaValueToString <$> lookupMeta "rights" meta - coverImage = - (TS.unpack <$> lookupContext "epub-cover-image" - (writerVariables opts)) + belongsToCollection = metaValueToString <$> lookupMeta "belongs-to-collection" meta + groupPosition = metaValueToString <$> lookupMeta "group-position" meta + coverImage = T.unpack <$> + lookupContext "epub-cover-image" (writerVariables opts) `mplus` 
(metaValueToString <$> lookupMeta "cover-image" meta) mCss = lookupMeta "css" meta <|> lookupMeta "stylesheet" meta stylesheets = maybe [] metaValueToPaths mCss ++ case lookupContext "css" (writerVariables opts) of - Just xs -> map TS.unpack xs + Just xs -> map T.unpack xs Nothing -> case lookupContext "css" (writerVariables opts) of - Just x -> [TS.unpack x] + Just x -> [T.unpack x] Nothing -> [] - pageDirection = case map toLower . metaValueToString <$> + pageDirection = case T.toLower . metaValueToString <$> lookupMeta "page-progression-direction" meta of Just "ltr" -> Just LTR Just "rtl" -> Just RTL _ -> Nothing ibooksFields = case lookupMeta "ibooks" meta of Just (MetaMap mp) - -> M.toList $ M.mapKeys TS.unpack $ M.map metaValueToString mp + -> M.toList $ M.map metaValueToString mp _ -> [] calibreFields = case lookupMeta "calibre" meta of Just (MetaMap mp) - -> M.toList $ M.mapKeys TS.unpack $ M.map metaValueToString mp + -> M.toList $ M.map metaValueToString mp _ -> [] -- | Produce an EPUB2 file from a Pandoc document. @@ -399,9 +411,11 @@ writeEPUB :: PandocMonad m writeEPUB epubVersion opts doc = do let epubSubdir = writerEpubSubdirectory opts -- sanity check on epubSubdir - unless (TS.all (\c -> isAscii c && isAlphaNum c) epubSubdir) $ + unless (T.all (\c -> isAscii c && isAlphaNum c) epubSubdir) $ throwError $ PandocEpubSubdirectoryError epubSubdir - let initState = EPUBState { stMediaPaths = [], stMediaNextId = 0, stEpubSubdir = TS.unpack epubSubdir } + let initState = EPUBState { stMediaPaths = [] + , stMediaNextId = 0 + , stEpubSubdir = T.unpack epubSubdir } evalStateT (pandocToEPUB epubVersion opts doc) initState pandocToEPUB :: PandocMonad m @@ -425,7 +439,7 @@ pandocToEPUB version opts doc = do [] -> case epubTitle metadata of [] -> "UNTITLED" (x:_) -> titleText x - x -> TS.unpack $ stringify x + x -> stringify x -- stylesheet stylesheets <- case epubStylesheets metadata of @@ -447,7 +461,8 @@ pandocToEPUB version opts doc = do (ListVal $ map (\e -> toVal' $ (if useprefix then "../" else "") <> - makeRelative epubSubdir (eRelativePath e)) + T.pack + (makeRelative epubSubdir (eRelativePath e))) stylesheetEntries) mempty @@ -465,28 +480,34 @@ pandocToEPUB version opts doc = do case epubCoverImage metadata of Nothing -> return ([],[]) Just img -> do - let coverImage = takeFileName img + let fp = takeFileName img + mediaPaths <- gets (map (fst . snd) . 
stMediaPaths) + coverImageName <- -- see #4206 + if ("media/" <> fp) `elem` mediaPaths + then getMediaNextNewName (takeExtension fp) + else return fp imgContent <- lift $ P.readFileLazy img (coverImageWidth, coverImageHeight) <- case imageSize opts' (B.toStrict imgContent) of Right sz -> return $ sizeInPixels sz Left err' -> (0, 0) <$ report - (CouldNotDetermineImageSize (TS.pack img) err') + (CouldNotDetermineImageSize (T.pack img) err') cpContent <- lift $ writeHtml opts'{ writerVariables = Context (M.fromList [ ("coverpage", toVal' "true"), ("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle), - ("cover-image", toVal' coverImage), + escapeStringForXML plainTitle), + ("cover-image", + toVal' $ T.pack coverImageName), ("cover-image-width", toVal' $ - show coverImageWidth), + tshow coverImageWidth), ("cover-image-height", toVal' $ - show coverImageHeight)]) <> + tshow coverImageHeight)]) <> cssvars True <> vars } (Pandoc meta []) coverEntry <- mkEntry "text/cover.xhtml" cpContent - coverImageEntry <- mkEntry ("media/" ++ coverImage) + coverImageEntry <- mkEntry ("media/" ++ coverImageName) imgContent return ( [ coverEntry ] , [ coverImageEntry ] ) @@ -498,7 +519,7 @@ pandocToEPUB version opts doc = do ("titlepage", toVal' "true"), ("body-type", toVal' "frontmatter"), ("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle)]) + escapeStringForXML plainTitle)]) <> cssvars True <> vars } (Pandoc meta []) tpEntry <- mkEntry "text/title_page.xhtml" tpContent @@ -507,7 +528,7 @@ pandocToEPUB version opts doc = do let matchingGlob f = do xs <- lift $ P.glob f when (null xs) $ - report $ CouldNotFetchResource (TS.pack f) "glob did not match any font files" + report $ CouldNotFetchResource (T.pack f) "glob did not match any font files" return xs let mkFontEntry f = mkEntry ("fonts/" ++ takeFileName f) =<< lift (P.readFileLazy f) @@ -554,16 +575,42 @@ pandocToEPUB version opts doc = do let chapters' = secsToChapters secs - let extractLinkURL' :: Int -> Inline -> [(TS.Text, TS.Text)] + let extractLinkURL' :: Int -> Inline -> [(T.Text, T.Text)] extractLinkURL' num (Span (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] + extractLinkURL' num (Link (ident, _, _) _ _) + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] + extractLinkURL' num (Image (ident, _, _) _ _) + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] + extractLinkURL' num (RawInline fmt raw) + | isHtmlFormat fmt + = foldr (\tag -> + case tag of + TagOpen{} -> + case fromAttrib "id" tag of + "" -> id + x -> ((x, showChapter num <> "#" <> x):) + _ -> id) + [] (parseTags raw) extractLinkURL' _ _ = [] - let extractLinkURL :: Int -> Block -> [(TS.Text, TS.Text)] + let extractLinkURL :: Int -> Block -> [(T.Text, T.Text)] extractLinkURL num (Div (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL num (Header _ (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] + extractLinkURL num (Table (ident,_,_) _ _ _ _ _) + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] + extractLinkURL num (RawBlock fmt raw) + | isHtmlFormat fmt + = foldr (\tag -> + case tag of + TagOpen{} -> + case fromAttrib "id" tag of + "" -> id + x -> ((x, 
showChapter num <> "#" <> x):) + _ -> id) + [] (parseTags raw) extractLinkURL num b = query (extractLinkURL' num) b let reftable = concat $ zipWith (\(Chapter bs) num -> @@ -572,7 +619,7 @@ pandocToEPUB version opts doc = do let fixInternalReferences :: Inline -> Inline fixInternalReferences (Link attr lab (src, tit)) - | Just ('#', xs) <- TS.uncons src = case lookup xs reftable of + | Just ('#', xs) <- T.uncons src = case lookup xs reftable of Just ys -> Link attr lab (ys, tit) Nothing -> Link attr lab (src, tit) fixInternalReferences x = x @@ -585,7 +632,7 @@ pandocToEPUB version opts doc = do chapters' let chapToEntry num (Chapter bs) = - mkEntry ("text/" ++ showChapter num) =<< + mkEntry ("text/" ++ T.unpack (showChapter num)) =<< writeHtml opts'{ writerVariables = Context (M.fromList [("body-type", toVal' bodyType), @@ -632,12 +679,12 @@ pandocToEPUB version opts doc = do let chapterNode ent = unode "item" ! ([("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), ("media-type", "application/xhtml+xml")] ++ case props ent of [] -> [] - xs -> [("properties", unwords xs)]) + xs -> [("properties", T.unwords xs)]) $ () let chapterRefNode ent = unode "itemref" ! @@ -646,17 +693,17 @@ pandocToEPUB version opts doc = do let pictureNode ent = unode "item" ! [("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), ("media-type", - maybe "application/octet-stream" TS.unpack + fromMaybe "application/octet-stream" $ mediaTypeOf $ eRelativePath ent)] $ () let fontNode ent = unode "item" ! [("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), - ("media-type", maybe "" TS.unpack $ + ("media-type", fromMaybe "" $ getMimeType $ eRelativePath ent)] $ () let tocTitle = maybe plainTitle @@ -664,8 +711,8 @@ pandocToEPUB version opts doc = do uuid <- case epubIdentifier metadata of (x:_) -> return $ identifierText x -- use first identifier as UUID [] -> throwError $ PandocShouldNeverHappenError "epubIdentifier is null" -- shouldn't happen - currentTime <- lift P.getCurrentTime - let contentsData = UTF8.fromStringLazy $ ppTopElement $ + currentTime <- lift P.getTimestamp + let contentsData = UTF8.fromTextLazy $ TL.fromStrict $ ppTopElement $ unode "package" ! ([("version", case version of EPUB2 -> "2.0" @@ -683,7 +730,8 @@ pandocToEPUB version opts doc = do ,("media-type","application/xhtml+xml")] ++ [("properties","nav") | epub3 ]) $ () ] ++ - [ unode "item" ! [("id","stylesheet" ++ show n), ("href",fp) + [ unode "item" ! [("id","stylesheet" <> tshow n) + , ("href", T.pack fp) ,("media-type","text/css")] $ () | (n :: Int, fp) <- zip [1..] (map (makeRelative epubSubdir . 
eRelativePath) @@ -728,7 +776,7 @@ pandocToEPUB version opts doc = do let tocLevel = writerTOCDepth opts let navPointNode :: PandocMonad m - => (Int -> [Inline] -> TS.Text -> [Element] -> Element) + => (Int -> [Inline] -> T.Text -> [Element] -> Element) -> Block -> StateT Int m [Element] navPointNode formatter (Div (ident,_,_) (Header lvl (_,_,kvs) ils : children)) = @@ -738,7 +786,7 @@ pandocToEPUB version opts doc = do n <- get modify (+1) let num = fromMaybe "" $ lookup "number" kvs - let tit = if writerNumberSections opts && not (TS.null num) + let tit = if writerNumberSections opts && not (T.null num) then Span ("", ["section-header-number"], []) [Str num] : Space : ils else ils @@ -752,21 +800,21 @@ pandocToEPUB version opts doc = do concat <$> mapM (navPointNode formatter) bs navPointNode _ _ = return [] - let navMapFormatter :: Int -> [Inline] -> TS.Text -> [Element] -> Element + let navMapFormatter :: Int -> [Inline] -> T.Text -> [Element] -> Element navMapFormatter n tit src subs = unode "navPoint" ! - [("id", "navPoint-" ++ show n)] $ - [ unode "navLabel" $ unode "text" $ TS.unpack $ stringify tit - , unode "content" ! [("src", "text/" <> TS.unpack src)] $ () + [("id", "navPoint-" <> tshow n)] $ + [ unode "navLabel" $ unode "text" $ stringify tit + , unode "content" ! [("src", "text/" <> src)] $ () ] ++ subs let tpNode = unode "navPoint" ! [("id", "navPoint-0")] $ - [ unode "navLabel" $ unode "text" (TS.unpack $ stringify $ docTitle' meta) + [ unode "navLabel" $ unode "text" (stringify $ docTitle' meta) , unode "content" ! [("src", "text/title_page.xhtml")] $ () ] navMap <- lift $ evalStateT (concat <$> mapM (navPointNode navMapFormatter) secs) 1 - let tocData = UTF8.fromStringLazy $ ppTopElement $ + let tocData = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "ncx" ! [("version","2005-1") ,("xmlns","http://www.daisy.org/z3986/2005/ncx/")] $ [ unode "head" $ @@ -788,23 +836,24 @@ pandocToEPUB version opts doc = do ] tocEntry <- mkEntry "toc.ncx" tocData - let navXhtmlFormatter :: Int -> [Inline] -> TS.Text -> [Element] -> Element + let navXhtmlFormatter :: Int -> [Inline] -> T.Text -> [Element] -> Element navXhtmlFormatter n tit src subs = unode "li" ! - [("id", "toc-li-" ++ show n)] $ + [("id", "toc-li-" <> tshow n)] $ (unode "a" ! - [("href", "text/" <> TS.unpack src)] + [("href", "text/" <> src)] $ titElements) : case subs of [] -> [] (_:_) -> [unode "ol" ! [("class","toc")] $ subs] - where titElements = parseXML titRendered + where titElements = either (const []) id $ + parseXMLContents (TL.fromStrict titRendered) titRendered = case P.runPure (writeHtmlStringForEPUB version opts{ writerTemplate = Nothing , writerVariables = Context (M.fromList [("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle)]) + escapeStringForXML plainTitle)]) <> writerVariables opts} (Pandoc nullMeta [Plain $ walk clean tit])) of @@ -819,33 +868,40 @@ pandocToEPUB version opts doc = do tocBlocks <- lift $ evalStateT (concat <$> mapM (navPointNode navXhtmlFormatter) secs) 1 let navBlocks = [RawBlock (Format "html") - $ TS.pack $ showElement $ -- prettyprinting introduces bad spaces + $ showElement $ -- prettyprinting introduces bad spaces unode navtag ! ([("epub:type","toc") | epub3] ++ [("id","toc")]) $ [ unode "h1" ! [("id","toc-title")] $ tocTitle , unode "ol" ! [("class","toc")] $ tocBlocks ]] let landmarkItems = if epub3 - then [ unode "li" + then unode "li" + [ unode "a" ! 
[("href", + "text/title_page.xhtml") + ,("epub:type", "titlepage")] $ + ("Title Page" :: Text) ] : + [ unode "li" [ unode "a" ! [("href", "text/cover.xhtml") ,("epub:type", "cover")] $ - ("Cover" :: String)] | + ("Cover" :: Text)] | isJust (epubCoverImage metadata) ] ++ [ unode "li" [ unode "a" ! [("href", "#toc") ,("epub:type", "toc")] $ - ("Table of contents" :: String) + ("Table of Contents" :: Text) ] | writerTableOfContents opts ] else [] - let landmarks = [RawBlock (Format "html") $ TS.pack $ ppElement $ + let landmarks = [RawBlock (Format "html") $ ppElement $ unode "nav" ! [("epub:type","landmarks") ,("id","landmarks") ,("hidden","hidden")] $ [ unode "ol" landmarkItems ] | not (null landmarkItems)] navData <- lift $ writeHtml opts'{ writerVariables = - Context (M.fromList [("navpage", toVal' "true")]) + Context (M.fromList [("navpage", toVal' "true") + ,("body-type", toVal' "frontmatter") + ]) <> cssvars False <> vars } (Pandoc (setMeta "title" (walk removeNote $ fromList $ docTitle' meta) nullMeta) @@ -857,22 +913,22 @@ pandocToEPUB version opts doc = do UTF8.fromStringLazy "application/epub+zip" -- container.xml - let containerData = UTF8.fromStringLazy $ ppTopElement $ + let containerData = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "container" ! [("version","1.0") ,("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")] $ unode "rootfiles" $ unode "rootfile" ! [("full-path", (if null epubSubdir then "" - else epubSubdir ++ "/") ++ "content.opf") + else T.pack epubSubdir <> "/") <> "content.opf") ,("media-type","application/oebps-package+xml")] $ () containerEntry <- mkEntry "META-INF/container.xml" containerData -- com.apple.ibooks.display-options.xml - let apple = UTF8.fromStringLazy $ ppTopElement $ + let apple = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "display_options" $ unode "platform" ! [("name","*")] $ - unode "option" ! [("name","specified-fonts")] $ ("true" :: String) + unode "option" ! [("name","specified-fonts")] $ ("true" :: Text) appleEntry <- mkEntry "META-INF/com.apple.ibooks.display-options.xml" apple -- construct archive @@ -893,8 +949,9 @@ metadataElement version md currentTime = ++ descriptionNodes ++ typeNodes ++ formatNodes ++ publisherNodes ++ sourceNodes ++ relationNodes ++ coverageNodes ++ rightsNodes ++ coverImageNodes - ++ modifiedNodes - withIds base f = concat . zipWith f (map (\x -> base ++ ('-' : show x)) + ++ modifiedNodes ++ belongsToCollectionNodes + withIds base f = concat . zipWith f (map (\x -> base <> + T.cons '-' (tshow x)) ([1..] :: [Int])) identifierNodes = withIds "epub-id" toIdentifierNode $ epubIdentifier md @@ -908,9 +965,9 @@ metadataElement version md currentTime = (x:_) -> [dcNode "date" ! [("id","epub-date")] $ dateText x] ibooksNodes = map ibooksNode (epubIbooksFields md) - ibooksNode (k, v) = unode "meta" ! [("property", "ibooks:" ++ k)] $ v + ibooksNode (k, v) = unode "meta" ! [("property", "ibooks:" <> k)] $ v calibreNodes = map calibreNode (epubCalibreFields md) - calibreNode (k, v) = unode "meta" ! [("name", "calibre:" ++ k), + calibreNode (k, v) = unode "meta" ! [("name", "calibre:" <> k), ("content", v)] $ () languageNodes = [dcTag "language" $ epubLanguage md] creatorNodes = withIds "epub-creator" (toCreatorNode "creator") $ @@ -932,7 +989,16 @@ metadataElement version md currentTime = $ epubCoverImage md modifiedNodes = [ unode "meta" ! 
[("property", "dcterms:modified")] $ showDateTimeISO8601 currentTime | version == EPUB3 ] - dcTag n s = unode ("dc:" ++ n) s + belongsToCollectionNodes = + maybe [] + (\belongsToCollection -> (unode "meta" ! [("property", "belongs-to-collection"), ("id", "epub-collection-1")] $ belongsToCollection ) + : + [unode "meta" ! [("refines", "#epub-collection-1"), ("property", "collection-type")] $ ("series" :: Text) ]) + (epubBelongsToCollection md)++ + maybe [] + (\groupPosition -> [unode "meta" ! [("refines", "#epub-collection-1"), ("property", "group-position")] $ groupPosition ]) + (epubGroupPosition md) + dcTag n s = unode ("dc:" <> n) s dcTag' n s = [dcTag n s] toIdentifierNode id' (Identifier txt scheme) | version == EPUB2 = [dcNode "identifier" ! @@ -940,7 +1006,7 @@ metadataElement version md currentTime = txt] | otherwise = (dcNode "identifier" ! [("id",id')] $ txt) : maybe [] ((\x -> [unode "meta" ! - [ ("refines",'#':id') + [ ("refines","#" <> id') , ("property","identifier-type") , ("scheme","onix:codelist5") ] @@ -956,10 +1022,10 @@ metadataElement version md currentTime = (creatorRole creator >>= toRelator)) $ creatorText creator] | otherwise = [dcNode s ! [("id",id')] $ creatorText creator] ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","file-as")] $ x]) + [("refines","#" <> id'),("property","file-as")] $ x]) (creatorFileAs creator) ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","role"), + [("refines","#" <> id'),("property","role"), ("scheme","marc:relators")] $ x]) (creatorRole creator >>= toRelator) toTitleNode id' title @@ -971,16 +1037,16 @@ metadataElement version md currentTime = | otherwise = [dcNode "title" ! [("id",id')] $ titleText title] ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","file-as")] $ x]) + [("refines","#" <> id'),("property","file-as")] $ x]) (titleFileAs title) ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","title-type")] $ x]) + [("refines","#" <> id'),("property","title-type")] $ x]) (titleType title) toDateNode id' date = [dcNode "date" ! (("id",id') : maybe [] (\x -> [("opf:event",x)]) (dateEvent date)) $ dateText date] - schemeToOnix :: String -> String + schemeToOnix :: Text -> Text schemeToOnix "ISBN-10" = "02" schemeToOnix "GTIN-13" = "03" schemeToOnix "UPC" = "04" @@ -998,59 +1064,64 @@ metadataElement version md currentTime = schemeToOnix "OLCC" = "28" schemeToOnix _ = "01" -showDateTimeISO8601 :: UTCTime -> String -showDateTimeISO8601 = formatTime defaultTimeLocale "%FT%TZ" +showDateTimeISO8601 :: UTCTime -> Text +showDateTimeISO8601 = T.pack . 
formatTime defaultTimeLocale "%FT%TZ" transformTag :: PandocMonad m - => Tag TS.Text - -> E m (Tag TS.Text) + => Tag T.Text + -> E m (Tag T.Text) transformTag tag@(TagOpen name attr) | name `elem` ["video", "source", "img", "audio"] && isNothing (lookup "data-external" attr) = do let src = fromAttrib "src" tag let poster = fromAttrib "poster" tag - newsrc <- modifyMediaRef $ TS.unpack src - newposter <- modifyMediaRef $ TS.unpack poster + newsrc <- modifyMediaRef $ T.unpack src + newposter <- modifyMediaRef $ T.unpack poster let attr' = filter (\(x,_) -> x /= "src" && x /= "poster") attr ++ - [("src", "../" <> newsrc) | not (TS.null newsrc)] ++ - [("poster", "../" <> newposter) | not (TS.null newposter)] + [("src", "../" <> newsrc) | not (T.null newsrc)] ++ + [("poster", "../" <> newposter) | not (T.null newposter)] return $ TagOpen name attr' transformTag tag = return tag modifyMediaRef :: PandocMonad m => FilePath - -> E m TS.Text + -> E m T.Text modifyMediaRef "" = return "" modifyMediaRef oldsrc = do media <- gets stMediaPaths case lookup oldsrc media of - Just (n,_) -> return $ TS.pack n + Just (n,_) -> return $ T.pack n Nothing -> catchError - (do (img, mbMime) <- P.fetchItem $ TS.pack oldsrc - let ext = maybe (takeExtension (takeWhile (/='?') oldsrc)) TS.unpack + (do (img, mbMime) <- P.fetchItem $ T.pack oldsrc + let ext = maybe (takeExtension (takeWhile (/='?') oldsrc)) T.unpack (("." <>) <$> (mbMime >>= extensionFromMimeType)) newName <- getMediaNextNewName ext let newPath = "media/" ++ newName entry <- mkEntry newPath (B.fromChunks . (:[]) $ img) modify $ \st -> st{ stMediaPaths = (oldsrc, (newPath, Just entry)):media} - return $ TS.pack newPath) + return $ T.pack newPath) (\e -> do - report $ CouldNotFetchResource (TS.pack oldsrc) (tshow e) - return $ TS.pack oldsrc) + report $ CouldNotFetchResource (T.pack oldsrc) (tshow e) + return $ T.pack oldsrc) -getMediaNextNewName :: PandocMonad m => String -> E m String +getMediaNextNewName :: PandocMonad m => FilePath -> E m FilePath getMediaNextNewName ext = do nextId <- gets stMediaNextId modify $ \st -> st { stMediaNextId = nextId + 1 } - let nextName = "file" ++ show nextId ++ ext - (P.fetchItem (TS.pack nextName) >> getMediaNextNewName ext) `catchError` const (return nextName) + return $ "file" ++ show nextId ++ ext + +isHtmlFormat :: Format -> Bool +isHtmlFormat (Format "html") = True +isHtmlFormat (Format "html4") = True +isHtmlFormat (Format "html5") = True +isHtmlFormat _ = False transformBlock :: PandocMonad m => Block -> E m Block transformBlock (RawBlock fmt raw) - | fmt == Format "html" = do + | isHtmlFormat fmt = do let tags = parseTags raw tags' <- mapM transformTag tags return $ RawBlock fmt (renderTags' tags') @@ -1060,56 +1131,43 @@ transformInline :: PandocMonad m => WriterOptions -> Inline -> E m Inline -transformInline _opts (Image attr lab (src,tit)) = do - newsrc <- modifyMediaRef $ TS.unpack src +transformInline _opts (Image attr@(_,_,kvs) lab (src,tit)) + | isNothing (lookup "external" kvs) = do + newsrc <- modifyMediaRef $ T.unpack src return $ Image attr lab ("../" <> newsrc, tit) transformInline opts x@(Math t m) | WebTeX url <- writerHTMLMathMethod opts = do - newsrc <- modifyMediaRef (TS.unpack url <> urlEncode (TS.unpack m)) + newsrc <- modifyMediaRef (T.unpack url <> urlEncode (T.unpack m)) let mathclass = if t == DisplayMath then "display" else "inline" return $ Span ("",["math",mathclass],[]) [Image nullAttr [x] ("../" <> newsrc, "")] transformInline _opts (RawInline fmt raw) - | fmt == Format "html" = 
do + | isHtmlFormat fmt = do let tags = parseTags raw tags' <- mapM transformTag tags return $ RawInline fmt (renderTags' tags') transformInline _ x = return x -(!) :: (t -> Element) -> [(String, String)] -> t -> Element +(!) :: (t -> Element) -> [(Text, Text)] -> t -> Element (!) f attrs n = add_attrs (map (\(k,v) -> Attr (unqual k) v) attrs) (f n) --- | Version of 'ppTopElement' that specifies UTF-8 encoding. -ppTopElement :: Element -> String -ppTopElement = ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" ++) . unEntity . ppElement - -- unEntity removes numeric entities introduced by ppElement - -- (kindlegen seems to choke on these). - where unEntity [] = "" - unEntity ('&':'#':xs) = - let (ds,ys) = break (==';') xs - rest = drop 1 ys - in case safeRead (TS.pack $ "'\\" <> ds <> "'") of - Just x -> x : unEntity rest - Nothing -> '&':'#':unEntity xs - unEntity (x:xs) = x : unEntity xs - mediaTypeOf :: FilePath -> Maybe MimeType mediaTypeOf x = let mediaPrefixes = ["image", "video", "audio"] in case getMimeType x of - Just y | any (`TS.isPrefixOf` y) mediaPrefixes -> Just y + Just y | any (`T.isPrefixOf` y) mediaPrefixes -> Just y _ -> Nothing -- Returns filename for chapter number. -showChapter :: Int -> String -showChapter = printf "ch%03d.xhtml" +showChapter :: Int -> Text +showChapter = T.pack . printf "ch%03d.xhtml" -- Add identifiers to any headers without them. addIdentifiers :: WriterOptions -> [Block] -> [Block] addIdentifiers opts bs = evalState (mapM go bs) Set.empty where go (Header n (ident,classes,kvs) ils) = do ids <- get - let ident' = if TS.null ident + let ident' = if T.null ident then uniqueIdent (writerExtensions opts) ils ids else ident modify $ Set.insert ident' @@ -1117,27 +1175,27 @@ addIdentifiers opts bs = evalState (mapM go bs) Set.empty go x = return x -- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM -normalizeDate' :: String -> Maybe String -normalizeDate' = fmap TS.unpack . go . trim . TS.pack +normalizeDate' :: Text -> Maybe Text +normalizeDate' = go . T.strip where go xs - | TS.length xs == 4 -- YYY - , TS.all isDigit xs = Just xs - | (y, s) <- TS.splitAt 4 xs -- YYY-MM - , Just ('-', m) <- TS.uncons s - , TS.length m == 2 - , TS.all isDigit y && TS.all isDigit m = Just xs + | T.length xs == 4 -- YYY + , T.all isDigit xs = Just xs + | (y, s) <- T.splitAt 4 xs -- YYY-MM + , Just ('-', m) <- T.uncons s + , T.length m == 2 + , T.all isDigit y && T.all isDigit m = Just xs | otherwise = normalizeDate xs -toRelator :: String -> Maybe String +toRelator :: Text -> Maybe Text toRelator x | x `elem` relators = Just x - | otherwise = lookup (map toLower x) relatorMap + | otherwise = lookup (T.toLower x) relatorMap -relators :: [String] +relators :: [Text] relators = map snd relatorMap -relatorMap :: [(String, String)] +relatorMap :: [(Text, Text)] relatorMap = [("abridger", "abr") ,("actor", "act") diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 701ff3d9b..3b5d04427 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -3,7 +3,7 @@ {- | Module : Text.Pandoc.Writers.FB2 Copyright : Copyright (C) 2011-2012 Sergey Astanin - 2012-2020 John MacFarlane + 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane @@ -19,29 +19,29 @@ FictionBook is an XML-based e-book format. 
For more information see: module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad (zipWithM) -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify) import Data.ByteString.Base64 (encode) import Data.Char (isAscii, isControl, isSpace) import Data.Either (lefts, rights) import Data.List (intercalate) -import Data.Text (Text, pack) +import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE import Network.HTTP (urlEncode) -import Text.XML.Light -import qualified Text.XML.Light as X -import qualified Text.XML.Light.Cursor as XC -import qualified Text.XML.Light.Input as XI +import Text.Pandoc.XML.Light as X import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Logging import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def) import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers, makeSections, tshow, stringify) import Text.Pandoc.Writers.Shared (lookupMetaString, toLegacyTable) +import Data.Generics (everywhere, mkT) -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -86,7 +86,7 @@ pandocToFB2 opts (Pandoc meta blocks) = do (imgs,missing) <- get >>= (lift . fetchImages . imagesToFetch) let body' = replaceImagesWithAlt missing body let fb2_xml = el "FictionBook" (fb2_attrs, [desc, body'] ++ notes ++ imgs) - return $ pack $ xml_head ++ showContent fb2_xml ++ "\n" + return $ xml_head <> showContent fb2_xml <> "\n" where xml_head = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" fb2_attrs = @@ -98,8 +98,8 @@ pandocToFB2 opts (Pandoc meta blocks) = do description :: PandocMonad m => Meta -> FBM m Content description meta' = do let genre = case lookupMetaString "genre" meta' of - "" -> el "genre" ("unrecognised" :: String) - s -> el "genre" (T.unpack s) + "" -> el "genre" ("unrecognised" :: Text) + s -> el "genre" s bt <- booktitle meta' let as = authors meta' dd <- docdate meta' @@ -110,7 +110,7 @@ description meta' = do Just (MetaInlines [Str s]) -> [el "lang" $ iso639 s] Just (MetaString s) -> [el "lang" $ iso639 s] _ -> [] - where iso639 = T.unpack . T.takeWhile (/= '-') -- Convert BCP 47 to ISO 639 + where iso639 = T.takeWhile (/= '-') -- Convert BCP 47 to ISO 639 let coverimage url = do let img = Image nullAttr mempty (url, "") im <- insertImage InlineImage img @@ -122,7 +122,7 @@ description meta' = do return $ el "description" [ el "title-info" (genre : (as ++ bt ++ annotation ++ dd ++ coverpage ++ lang)) - , el "document-info" [el "program-used" ("pandoc" :: String)] + , el "document-info" [el "program-used" ("pandoc" :: Text)] ] booktitle :: PandocMonad m => Meta -> FBM m [Content] @@ -135,15 +135,15 @@ authors meta' = cMap author (docAuthors meta') author :: [Inline] -> [Content] author ss = - let ws = words . cMap plain $ ss - email = el "email" <$> take 1 (filter ('@' `elem`) ws) - ws' = filter ('@' `notElem`) ws + let ws = T.words $ mconcat $ map plain ss + email = el "email" <$> take 1 (filter (T.any (=='@')) ws) + ws' = filter (not . T.any (== '@')) ws names = case ws' of [nickname] -> [ el "nickname" nickname ] [fname, lname] -> [ el "first-name" fname , el "last-name" lname ] (fname:rest) -> [ el "first-name" fname - , el "middle-name" (concat . 
init $ rest) + , el "middle-name" (T.concat . init $ rest) , el "last-name" (last rest) ] [] -> [] in list $ el "author" (names ++ email) @@ -204,7 +204,7 @@ renderFootnotes = do el "body" ([uattr "name" "notes"], map renderFN (reverse fns)) where renderFN (n, idstr, cs) = - let fn_texts = el "title" (el "p" (show n)) : cs + let fn_texts = el "title" (el "p" (tshow n)) : cs in el "section" ([uattr "id" idstr], fn_texts) -- | Fetch images and encode them for the FictionBook XML. @@ -280,7 +280,7 @@ isMimeType s = where types = ["text","image","audio","video","application","message","multipart"] valid c = isAscii c && not (isControl c) && not (isSpace c) && - c `notElem` ("()<>@,;:\\\"/[]?=" :: String) + c `notElem` ("()<>@,;:\\\"/[]?=" :: [Char]) footnoteID :: Int -> Text footnoteID i = "n" <> tshow i @@ -304,10 +304,13 @@ blockToXml (Para [Image atr alt (src,tgt)]) = insertImage NormalImage (Image atr alt (src,tit)) blockToXml (Para ss) = list . el "p" <$> cMapM toXml ss blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . - map (el "p" . el "code" . T.unpack) . T.lines $ s + map (el "p" . el "code") . T.lines $ s blockToXml (RawBlock f str) = if f == Format "fb2" - then return $ XI.parseXML str + then + case parseXMLContents (TL.fromStrict str) of + Left msg -> throwError $ PandocXMLError "" msg + Right nds -> return nds else return [] blockToXml (Div _ bs) = cMapM blockToXml bs blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs @@ -341,11 +344,11 @@ blockToXml (Table _ blkCapt specs thead tbody tfoot) = do c <- el "emphasis" <$> cMapM toXml caption return [el "table" (hd <> bd), el "p" c] where - mkrow :: PandocMonad m => String -> [[Block]] -> [Alignment] -> FBM m Content + mkrow :: PandocMonad m => Text -> [[Block]] -> [Alignment] -> FBM m Content mkrow tag cells aligns' = el "tr" <$> mapM (mkcell tag) (zip cells aligns') -- - mkcell :: PandocMonad m => String -> ([Block], Alignment) -> FBM m Content + mkcell :: PandocMonad m => Text -> ([Block], Alignment) -> FBM m Content mkcell tag (cell, align) = do cblocks <- cMapM blockToXml cell return $ el tag ([align_attr align], cblocks) @@ -419,7 +422,7 @@ toXml (Quoted DoubleQuote ss) = do inner <- cMapM toXml ss return $ [txt "“"] ++ inner ++ [txt "”"] toXml (Cite _ ss) = cMapM toXml ss -- FIXME: support citation styles -toXml (Code _ s) = return [el "code" $ T.unpack s] +toXml (Code _ s) = return [el "code" s] toXml Space = return [txt " "] toXml SoftBreak = return [txt "\n"] toXml LineBreak = return [txt "\n"] @@ -451,7 +454,7 @@ insertMath immode formula = do let imgurl = url <> T.pack (urlEncode $ T.unpack formula) let img = Image nullAttr alt (imgurl, "") insertImage immode img - _ -> return [el "code" $ T.unpack formula] + _ -> return [el "code" formula] insertImage :: PandocMonad m => ImageMode -> Inline -> FBM m [Content] insertImage immode (Image _ alt (url,ttl)) = do @@ -466,31 +469,16 @@ insertImage immode (Image _ alt (url,ttl)) = do el "image" $ [ attr ("l","href") ("#" <> fname) , attr ("l","type") (tshow immode) - , uattr "alt" (T.pack $ cMap plain alt) ] + , uattr "alt" (mconcat $ map plain alt) ] ++ ttlattr insertImage _ _ = error "unexpected inline instead of image" replaceImagesWithAlt :: [Text] -> Content -> Content -replaceImagesWithAlt missingHrefs body = - let cur = XC.fromContent body - cur' = replaceAll cur - in XC.toTree . 
XC.root $ cur' +replaceImagesWithAlt missingHrefs = everywhere (mkT go) where - -- - replaceAll :: XC.Cursor -> XC.Cursor - replaceAll c = - let n = XC.current c - c' = if isImage n && isMissing n - then XC.modifyContent replaceNode c - else c - in case XC.nextDF c' of - (Just cnext) -> replaceAll cnext - Nothing -> c' -- end of document - -- - isImage :: Content -> Bool - isImage (Elem e) = elName e == uname "image" - isImage _ = False - -- + go c = if isMissing c + then replaceNode c + else c isMissing (Elem img@Element{}) = let imgAttrs = elAttribs img badAttrs = map (attr ("l","href")) missingHrefs @@ -500,18 +488,18 @@ replaceImagesWithAlt missingHrefs body = replaceNode :: Content -> Content replaceNode n@(Elem img@Element{}) = let attrs = elAttribs img - alt = getAttrVal attrs (uname "alt") + alt = getAttrVal attrs (unqual "alt") imtype = getAttrVal attrs (qname "l" "type") in case (alt, imtype) of (Just alt', Just imtype') -> - if imtype' == show NormalImage + if imtype' == tshow NormalImage then el "p" alt' - else txt $ T.pack alt' - (Just alt', Nothing) -> txt $ T.pack alt' -- no type attribute + else txt alt' + (Just alt', Nothing) -> txt alt' -- no type attribute _ -> n -- don't replace if alt text is not found replaceNode n = n -- - getAttrVal :: [X.Attr] -> QName -> Maybe String + getAttrVal :: [X.Attr] -> QName -> Maybe Text getAttrVal attrs name = case filter ((name ==) . attrKey) attrs of (a:_) -> Just (attrVal a) @@ -519,7 +507,7 @@ replaceImagesWithAlt missingHrefs body = -- | Wrap all inlines with an XML tag (given its unqualified name). -wrap :: PandocMonad m => String -> [Inline] -> FBM m Content +wrap :: PandocMonad m => Text -> [Inline] -> FBM m Content wrap tagname inlines = el tagname `liftM` cMapM toXml inlines -- " Create a singleton list. @@ -527,31 +515,31 @@ list :: a -> [a] list = (:[]) -- | Convert an 'Inline' to plaintext. -plain :: Inline -> String -plain (Str s) = T.unpack s -plain (Emph ss) = cMap plain ss -plain (Underline ss) = cMap plain ss -plain (Span _ ss) = cMap plain ss -plain (Strong ss) = cMap plain ss -plain (Strikeout ss) = cMap plain ss -plain (Superscript ss) = cMap plain ss -plain (Subscript ss) = cMap plain ss -plain (SmallCaps ss) = cMap plain ss -plain (Quoted _ ss) = cMap plain ss -plain (Cite _ ss) = cMap plain ss -- FIXME -plain (Code _ s) = T.unpack s +plain :: Inline -> Text +plain (Str s) = s +plain (Emph ss) = mconcat $ map plain ss +plain (Underline ss) = mconcat $ map plain ss +plain (Span _ ss) = mconcat $ map plain ss +plain (Strong ss) = mconcat $ map plain ss +plain (Strikeout ss) = mconcat $ map plain ss +plain (Superscript ss) = mconcat $ map plain ss +plain (Subscript ss) = mconcat $ map plain ss +plain (SmallCaps ss) = mconcat $ map plain ss +plain (Quoted _ ss) = mconcat $ map plain ss +plain (Cite _ ss) = mconcat $ map plain ss -- FIXME +plain (Code _ s) = s plain Space = " " plain SoftBreak = " " plain LineBreak = "\n" -plain (Math _ s) = T.unpack s +plain (Math _ s) = s plain (RawInline _ _) = "" -plain (Link _ text (url,_)) = concat (map plain text ++ [" <", T.unpack url, ">"]) -plain (Image _ alt _) = cMap plain alt +plain (Link _ text (url,_)) = mconcat (map plain text ++ [" <", url, ">"]) +plain (Image _ alt _) = mconcat $ map plain alt plain (Note _) = "" -- FIXME -- | Create an XML element. 
el :: (Node t) - => String -- ^ unqualified element name + => Text -- ^ unqualified element name -> t -- ^ node contents -> Content -- ^ XML content el name cs = Elem $ unode name cs @@ -564,22 +552,18 @@ spaceBeforeAfter cs = -- | Create a plain-text XML content. txt :: Text -> Content -txt s = Text $ CData CDataText (T.unpack s) Nothing +txt s = Text $ CData CDataText s Nothing -- | Create an XML attribute with an unqualified name. -uattr :: String -> Text -> Text.XML.Light.Attr -uattr name = Attr (uname name) . T.unpack +uattr :: Text -> Text -> X.Attr +uattr name = Attr (unqual name) -- | Create an XML attribute with a qualified name from given namespace. -attr :: (String, String) -> Text -> Text.XML.Light.Attr -attr (ns, name) = Attr (qname ns name) . T.unpack - --- | Unqualified name -uname :: String -> QName -uname name = QName name Nothing Nothing +attr :: (Text, Text) -> Text -> X.Attr +attr (ns, name) = Attr (qname ns name) -- | Qualified name -qname :: String -> String -> QName +qname :: Text -> Text -> QName qname ns name = QName name Nothing (Just ns) -- | Abbreviation for 'concatMap'. diff --git a/src/Text/Pandoc/Writers/GridTable.hs b/src/Text/Pandoc/Writers/GridTable.hs new file mode 100644 index 000000000..bc468febc --- /dev/null +++ b/src/Text/Pandoc/Writers/GridTable.hs @@ -0,0 +1,157 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE TupleSections #-} + +{- | +Module : Text.Pandoc.Writers.GridTable +Copyright : © 2020-2021 Albert Krewinkel +License : GNU GPL, version 2 or above + +Maintainer : Albert Krewinkel <albert@zeitkraut.de> + +Grid representation of pandoc tables. +-} +module Text.Pandoc.Writers.GridTable + ( Table (..) + , GridCell (..) + , RowIndex (..) + , ColIndex (..) + , CellIndex + , Part (..) + , toTable + , rowArray + ) where + +import Control.Monad (forM_) +import Control.Monad.ST +import Data.Array +import Data.Array.MArray +import Data.Array.ST +import Data.Maybe (listToMaybe) +import Data.STRef +import Text.Pandoc.Definition hiding (Table) +import qualified Text.Pandoc.Builder as B + +-- | A grid cell contains either a real table cell, or is the +-- continuation of a column or row-spanning cell. In the latter case, +-- the index of the continued cell is provided. +data GridCell + = ContentCell Attr Alignment RowSpan ColSpan [Block] + | ContinuationCell CellIndex + deriving (Show) + +-- | Row index in a table part. +newtype RowIndex = RowIndex Int deriving (Enum, Eq, Ix, Ord, Show) +-- | Column index in a table part. +newtype ColIndex = ColIndex Int deriving (Enum, Eq, Ix, Ord, Show) + +-- | Index to a cell in a table part. +type CellIndex = (RowIndex, ColIndex) + +-- | Cells are placed on a grid. Row attributes are stored in a separate +-- array. 
+data Part = Part + { partAttr :: Attr + , partCellArray :: Array (RowIndex,ColIndex) GridCell + , partRowAttrs :: Array RowIndex Attr + } + +data Table = Table + { tableAttr :: Attr + , tableCaption :: Caption + , tableColSpecs :: Array ColIndex ColSpec + , tableRowHeads :: RowHeadColumns + , tableHead :: Part + , tableBodies :: [Part] + , tableFoot :: Part + } + +toTable + :: B.Attr + -> B.Caption + -> [B.ColSpec] + -> B.TableHead + -> [B.TableBody] + -> B.TableFoot + -> Table +toTable attr caption colSpecs thead tbodies tfoot = + Table attr caption colSpecs' rowHeads thGrid tbGrids tfGrid + where + colSpecs' = listArray (ColIndex 1, ColIndex $ length colSpecs) colSpecs + rowHeads = case listToMaybe tbodies of + Nothing -> RowHeadColumns 0 + Just (TableBody _attr rowHeadCols _headerRows _rows) -> rowHeadCols + thGrid = let (TableHead headAttr rows) = thead + in rowsToPart headAttr rows + tbGrids = map bodyToGrid tbodies + tfGrid = let (TableFoot footAttr rows) = tfoot + in rowsToPart footAttr rows + bodyToGrid (TableBody bodyAttr _rowHeadCols headRows rows) = + rowsToPart bodyAttr (headRows ++ rows) + +data BuilderCell + = FilledCell GridCell + | FreeCell + +fromBuilderCell :: BuilderCell -> GridCell +fromBuilderCell = \case + FilledCell c -> c + FreeCell -> error "Found an unassigned cell." + +rowsToPart :: Attr -> [B.Row] -> Part +rowsToPart attr = \case + [] -> Part + attr + (listArray ((RowIndex 1, ColIndex 1), (RowIndex 0, ColIndex 0)) []) + (listArray (RowIndex 1, RowIndex 0) []) + rows@(Row _attr firstRow:_) -> + let nrows = length rows + ncols = sum $ map (\(Cell _ _ _ (ColSpan cs) _) -> cs) firstRow + gbounds = ((RowIndex 1, ColIndex 1), (RowIndex nrows, ColIndex ncols)) + mutableGrid :: ST s (STArray s CellIndex GridCell) + mutableGrid = do + grid <- newArray gbounds FreeCell + ridx <- newSTRef (RowIndex 1) + forM_ rows $ \(Row _attr cells) -> do + cidx <- newSTRef (ColIndex 1) + forM_ cells $ \(Cell cellAttr align rs cs blks) -> do + ridx' <- readSTRef ridx + let nextFreeInRow colindex@(ColIndex c) = do + readArray grid (ridx', colindex) >>= \case + FreeCell -> pure colindex + _ -> nextFreeInRow $ ColIndex (c + 1) + cidx' <- readSTRef cidx >>= nextFreeInRow + writeArray grid (ridx', cidx') . FilledCell $ + ContentCell cellAttr align rs cs blks + forM_ (continuationIndices ridx' cidx' rs cs) $ \idx -> do + writeArray grid idx . FilledCell $ + ContinuationCell (ridx', cidx') + -- go to new column + writeSTRef cidx cidx' + -- go to next row + modifySTRef ridx (incrRowIndex 1) + -- Swap BuilderCells with normal GridCells. 
+ mapArray fromBuilderCell grid + in Part + { partCellArray = runSTArray mutableGrid + , partRowAttrs = listArray (RowIndex 1, RowIndex nrows) $ + map (\(Row rowAttr _) -> rowAttr) rows + , partAttr = attr + } + +continuationIndices :: RowIndex -> ColIndex -> RowSpan -> ColSpan -> [CellIndex] +continuationIndices (RowIndex ridx) (ColIndex cidx) rowspan colspan = + let (RowSpan rs) = rowspan + (ColSpan cs) = colspan + in [ (RowIndex r, ColIndex c) | r <- [ridx..(ridx + rs - 1)] + , c <- [cidx..(cidx + cs - 1)] + , (r, c) /= (ridx, cidx)] + +rowArray :: RowIndex -> Array CellIndex GridCell -> Array ColIndex GridCell +rowArray ridx grid = + let ((_minRidx, minCidx), (_maxRidx, maxCidx)) = bounds grid + in ixmap (minCidx, maxCidx) (ridx,) grid + +incrRowIndex :: RowSpan -> RowIndex -> RowIndex +incrRowIndex (RowSpan n) (RowIndex r) = RowIndex $ r + n diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index cba6b7d1c..6f91d1965 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -6,7 +6,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.HTML - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -31,9 +31,9 @@ module Text.Pandoc.Writers.HTML ( import Control.Monad.Identity (runIdentity) import Control.Monad.State.Strict import Data.Char (ord) -import Data.List (intercalate, intersperse, partition, delete, (\\)) +import Data.List (intercalate, intersperse, partition, delete, (\\), foldl') import Data.List.NonEmpty (NonEmpty((:|))) -import Data.Maybe (fromMaybe, isJust, isNothing, mapMaybe) +import Data.Maybe (fromMaybe, isJust, isNothing) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T @@ -265,6 +265,8 @@ pandocToHtml opts (Pandoc meta blocks) = do let stringifyHTML = escapeStringForXML . stringify let authsMeta = map stringifyHTML $ docAuthors meta let dateMeta = stringifyHTML $ docDate meta + let descriptionMeta = escapeStringForXML $ + lookupMetaString "description" meta slideVariant <- gets stSlideVariant let sects = adjustNumbers opts $ makeSections (writerNumberSections opts) Nothing $ @@ -354,6 +356,52 @@ pandocToHtml opts (Pandoc meta blocks) = do PlainMath -> defField "displaymath-css" True WebTeX _ -> defField "displaymath-css" True _ -> id) . + (if slideVariant == RevealJsSlides + then -- set boolean options explicitly, since + -- template can't distinguish False/undefined + defField "controls" True . + defField "controlsTutorial" True . + defField "controlsLayout" ("bottom-right" :: Text) . + defField "controlsBackArrows" ("faded" :: Text) . + defField "progress" True . + defField "slideNumber" False . + defField "showSlideNumber" ("all" :: Text) . + defField "hashOneBasedIndex" False . + defField "hash" False . + defField "respondToHashChanges" True . + defField "history" False . + defField "keyboard" True . + defField "overview" True . + defField "disableLayout" False . + defField "center" True . + defField "touch" True . + defField "loop" False . + defField "rtl" False . + defField "navigationMode" ("default" :: Text) . + defField "shuffle" False . + defField "fragments" True . + defField "fragmentInURL" True . + defField "embedded" False . + defField "help" True . + defField "pause" True . + defField "showNotes" False . + defField "autoPlayMedia" ("null" :: Text) . + defField "preloadIframes" ("null" :: Text) . 
+ defField "autoSlide" ("0" :: Text) . + defField "autoSlideStoppable" True . + defField "autoSlideMethod" ("null" :: Text) . + defField "defaultTiming" ("null" :: Text) . + defField "mouseWheel" False . + defField "display" ("block" :: Text) . + defField "hideInactiveCursor" True . + defField "hideCursorTime" ("5000" :: Text) . + defField "previewLinks" False . + defField "transition" ("slide" :: Text) . + defField "transitionSpeed" ("default" :: Text) . + defField "backgroundTransition" ("fade" :: Text) . + defField "viewDistance" ("3" :: Text) . + defField "mobileViewDistance" ("2" :: Text) + else id) . defField "document-css" (isNothing mCss && slideVariant == NoSlides) . defField "quotes" (stQuotes st) . -- for backwards compatibility we populate toc @@ -364,6 +412,7 @@ pandocToHtml opts (Pandoc meta blocks) = do defField "author-meta" authsMeta . maybe id (defField "date-meta") (normalizeDate dateMeta) . + defField "description-meta" descriptionMeta . defField "pagetitle" (stringifyHTML . docTitle $ meta) . defField "idprefix" (writerIdentifierPrefix opts) . @@ -553,30 +602,35 @@ tagWithAttributes opts html5 selfClosing tagname attr = addAttrs :: PandocMonad m => WriterOptions -> Attr -> Html -> StateT WriterState m Html -addAttrs opts attr h = foldl (!) h <$> attrsToHtml opts attr +addAttrs opts attr h = foldl' (!) h <$> attrsToHtml opts attr toAttrs :: PandocMonad m => [(Text, Text)] -> StateT WriterState m [Attribute] toAttrs kvs = do html5 <- gets stHtml5 mbEpubVersion <- gets stEPUBVersion - return $ mapMaybe (\(x,y) -> - if html5 - then - if x `Set.member` (html5Attributes <> rdfaAttributes) - || T.any (== ':') x -- e.g. epub: namespace - || "data-" `T.isPrefixOf` x - || "aria-" `T.isPrefixOf` x - then Just $ customAttribute (textTag x) (toValue y) - else Just $ customAttribute (textTag ("data-" <> x)) - (toValue y) - else - if mbEpubVersion == Just EPUB2 && - not (x `Set.member` (html4Attributes <> rdfaAttributes) || - "xml:" `T.isPrefixOf` x) - then Nothing - else Just $ customAttribute (textTag x) (toValue y)) - kvs + reverse . snd <$> foldM (go html5 mbEpubVersion) (Set.empty, []) kvs + where + go html5 mbEpubVersion (keys, attrs) (k,v) = do + if k `Set.member` keys + then do + report $ DuplicateAttribute k v + return (keys, attrs) + else return (Set.insert k keys, addAttr html5 mbEpubVersion k v attrs) + addAttr html5 mbEpubVersion x y + | html5 + = if x `Set.member` (html5Attributes <> rdfaAttributes) + || T.any (== ':') x -- e.g. epub: namespace + || "data-" `T.isPrefixOf` x + || "aria-" `T.isPrefixOf` x + then (customAttribute (textTag x) (toValue y) :) + else (customAttribute (textTag ("data-" <> x)) (toValue y) :) + | mbEpubVersion == Just EPUB2 + , not (x `Set.member` (html4Attributes <> rdfaAttributes) || + "xml:" `T.isPrefixOf` x) + = id + | otherwise + = (customAttribute (textTag x) (toValue y) :) attrsToHtml :: PandocMonad m => WriterOptions -> Attr -> StateT WriterState m [Attribute] @@ -617,17 +671,20 @@ dimensionsToAttrList attr = consolidateStyles $ go Width ++ go Height figure :: PandocMonad m => WriterOptions -> Attr -> [Inline] -> (Text, Text) -> StateT WriterState m Html -figure opts attr txt (s,tit) = do +figure opts attr@(_, _, attrList) txt (s,tit) = do html5 <- gets stHtml5 -- Screen-readers will normally read the @alt@ text and the figure; we -- want to avoid them reading the same text twice. With HTML5 we can -- use aria-hidden for the caption; with HTML4, we use an empty -- alt-text instead. 
+ -- When the alt text differs from the caption both should be read. let alt = if html5 then txt else [Str ""] let tocapt = if html5 - then H5.figcaption ! - H5.customAttribute (textTag "aria-hidden") - (toValue @Text "true") + then (H5.figcaption !) $ + if isJust (lookup "alt" attrList) + then mempty + else H5.customAttribute (textTag "aria-hidden") + (toValue @Text "true") else H.p ! A.class_ "caption" img <- inlineToHtml opts (Image attr alt (s,tit)) capt <- if null txt @@ -707,12 +764,12 @@ blockToHtml opts (Div (ident, "section":dclasses, dkvs) let fragmentClass = case slideVariant of RevealJsSlides -> "fragment" _ -> "incremental" - let inDiv zs = RawBlock (Format "html") ("<div class=\"" + let inDiv' zs = RawBlock (Format "html") ("<div class=\"" <> fragmentClass <> "\">") : (zs ++ [RawBlock (Format "html") "</div>"]) let breakOnPauses zs = case splitBy isPause zs of [] -> [] - y:ys -> y ++ concatMap inDiv ys + y:ys -> y ++ concatMap inDiv' ys let (titleBlocks, innerSecs) = if titleSlide -- title slides have no content of their own @@ -774,9 +831,10 @@ blockToHtml opts (Div attr@(ident, classes, kvs') bs) = do lookup "entry-spacing" kvs' >>= safeRead } let isCslBibEntry = "csl-entry" `elem` classes - let kvs = [(k,v) | (k,v) <- kvs', k /= "width"] ++ - [("style", "width:" <> w <> ";") | "column" `elem` classes, - ("width", w) <- kvs'] ++ + let kvs = [(k,v) | (k,v) <- kvs' + , k /= "width" || "column" `notElem` classes] ++ + [("style", "width:" <> w <> ";") | "column" `elem` classes + , ("width", w) <- kvs'] ++ [("role", "doc-bibliography") | isCslBibBody && html5] ++ [("role", "doc-biblioentry") | isCslBibEntry && html5] let speakerNotes = "notes" `elem` classes @@ -790,14 +848,17 @@ blockToHtml opts (Div attr@(ident, classes, kvs') bs) = do classes' = case slideVariant of NoSlides -> classes _ -> filter (\k -> k /= "incremental" && k /= "nonincremental") classes + let paraToPlain (Para ils) = Plain ils + paraToPlain x = x + let bs' = if "csl-entry" `elem` classes' + then walk paraToPlain bs + else bs contents <- if "columns" `elem` classes' then -- we don't use blockListToHtml because it inserts -- a newline between the column divs, which throws -- off widths! see #4028 - mconcat <$> mapM (blockToHtml opts) bs - else if isCslBibEntry - then mconcat <$> mapM (cslEntryToHtml opts') bs - else blockListToHtml opts' bs + mconcat <$> mapM (blockToHtml opts) bs' + else blockListToHtml opts' bs' let contents' = nl opts >> contents >> nl opts let (divtag, classes'') = if html5 && "section" `elem` classes' then (H5.section, filter (/= "section") classes') @@ -883,7 +944,7 @@ blockToHtml opts (BlockQuote blocks) = do else do contents <- blockListToHtml opts blocks return $ H.blockquote $ nl opts >> contents >> nl opts -blockToHtml opts (Header level attr@(_,classes,kvs) lst) = do +blockToHtml opts (Header level (ident,classes,kvs) lst) = do contents <- inlineListToHtml opts lst let secnum = fromMaybe mempty $ lookup "number" kvs let contents' = if writerNumberSections opts && not (T.null secnum) @@ -891,7 +952,13 @@ blockToHtml opts (Header level attr@(_,classes,kvs) lst) = do then (H.span ! 
A.class_ "header-section-number" $ toHtml secnum) >> strToHtml " " >> contents else contents - addAttrs opts attr + html5 <- gets stHtml5 + let kvs' = if html5 + then kvs + else [ (k, v) | (k, v) <- kvs + , k `elem` (["lang", "dir", "title", "style" + , "align"] ++ intrinsicEventsHTML4)] + addAttrs opts (ident,classes,kvs') $ case level of 1 -> H.h1 contents' 2 -> H.h2 contents' @@ -927,7 +994,7 @@ blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do numstyle'] else []) l <- ordList opts contents - return $ foldl (!) l attribs + return $ foldl' (!) l attribs blockToHtml opts (DefinitionList lst) = do contents <- mapM (\(term, defs) -> do term' <- liftM H.dt $ inlineListToHtml opts term @@ -1225,6 +1292,10 @@ inlineToHtml opts inline = do LineBreak -> return $ do if html5 then H5.br else H.br strToHtml "\n" + (Span ("",[cls],[]) ils) + | cls == "csl-block" || cls == "csl-left-margin" || + cls == "csl-right-inline" || cls == "csl-indent" + -> inlineListToHtml opts ils >>= inDiv cls (Span (id',classes,kvs) ils) -> let spanLikeTag = case classes of @@ -1377,7 +1448,7 @@ inlineToHtml opts inline = do return $ if T.null tit then link' else link' ! A.title (toValue tit) - (Image attr txt (s,tit)) -> do + (Image attr@(_, _, attrList) txt (s, tit)) -> do let alternate = stringify txt slideVariant <- gets stSlideVariant let isReveal = slideVariant == RevealJsSlides @@ -1390,7 +1461,8 @@ inlineToHtml opts inline = do [A.title $ toValue tit | not (T.null tit)] ++ attrs imageTag = (if html5 then H5.img else H.img - , [A.alt $ toValue alternate | not (null txt)] ) + , [A.alt $ toValue alternate | not (null txt) && + isNothing (lookup "alt" attrList)] ) mediaTag tg fallbackTxt = let linkTxt = if null txt then fallbackTxt @@ -1404,7 +1476,7 @@ inlineToHtml opts inline = do Just "audio" -> mediaTag H5.audio "Audio" Just _ -> (H5.embed, []) _ -> imageTag - return $ foldl (!) tag $ attributes ++ specAttrs + return $ foldl' (!) tag $ attributes ++ specAttrs -- note: null title included, as in Markdown.pl (Note contents) -> do notes <- gets stNotes @@ -1457,11 +1529,15 @@ blockListToNote opts ref blocks = do else let lastBlock = last blocks otherBlocks = init blocks in case lastBlock of - (Para lst) -> otherBlocks ++ + Para [Image _ _ (_,tit)] + | "fig:" `T.isPrefixOf` tit + -> otherBlocks ++ [lastBlock, + Plain backlink] + Para lst -> otherBlocks ++ [Para (lst ++ backlink)] - (Plain lst) -> otherBlocks ++ + Plain lst -> otherBlocks ++ [Plain (lst ++ backlink)] - _ -> otherBlocks ++ [lastBlock, + _ -> otherBlocks ++ [lastBlock, Plain backlink] contents <- blockListToHtml opts blocks' let noteItem = H.li ! prefixedId opts ("fn" <> ref) $ contents @@ -1474,22 +1550,12 @@ blockListToNote opts ref blocks = do _ -> noteItem return $ nl opts >> noteItem' -cslEntryToHtml :: PandocMonad m - => WriterOptions - -> Block - -> StateT WriterState m Html -cslEntryToHtml opts (Para xs) = do +inDiv :: PandocMonad m=> Text -> Html -> StateT WriterState m Html +inDiv cls x = do html5 <- gets stHtml5 - let inDiv :: Text -> Html -> Html - inDiv cls x = (if html5 then H5.div else H.div) - x ! A.class_ (toValue cls) - let go (Span ("",[cls],[]) ils) - | cls == "csl-block" || cls == "csl-left-margin" || - cls == "csl-right-inline" || cls == "csl-indent" - = inDiv cls <$> inlineListToHtml opts ils - go il = inlineToHtml opts il - mconcat <$> mapM go xs -cslEntryToHtml opts x = blockToHtml opts x + return $ + (if html5 then H5.div else H.div) + x ! 
A.class_ (toValue cls) isMathEnvironment :: Text -> Bool isMathEnvironment s = "\\begin{" `T.isPrefixOf` s && @@ -1529,6 +1595,12 @@ allowsMathEnvironments MathML = True allowsMathEnvironments (WebTeX _) = True allowsMathEnvironments _ = False +-- | List of intrinsic event attributes allowed on all elements in HTML4. +intrinsicEventsHTML4 :: [Text] +intrinsicEventsHTML4 = + [ "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover" + , "onmouseout", "onmouseout", "onkeypress", "onkeydown", "onkeyup"] + isRawHtml :: PandocMonad m => Format -> StateT WriterState m Bool isRawHtml f = do html5 <- gets stHtml5 diff --git a/src/Text/Pandoc/Writers/Haddock.hs b/src/Text/Pandoc/Writers/Haddock.hs index aaa19ed07..75e14714b 100644 --- a/src/Text/Pandoc/Writers/Haddock.hs +++ b/src/Text/Pandoc/Writers/Haddock.hs @@ -15,6 +15,7 @@ Haddock: <http://www.haskell.org/haddock/doc/html/> -} module Text.Pandoc.Writers.Haddock (writeHaddock) where import Control.Monad.State.Strict +import Data.Char (isAlphaNum) import Data.Default import Data.Text (Text) import qualified Data.Text as T @@ -71,8 +72,18 @@ notesToHaddock opts notes = -- | Escape special characters for Haddock. escapeString :: Text -> Text -escapeString = escapeStringUsing haddockEscapes - where haddockEscapes = backslashEscapes "\\/'`\"@<" +escapeString t + | T.all isAlphaNum t = t + | otherwise = T.concatMap escChar t + where + escChar '\\' = "\\\\" + escChar '/' = "\\/" + escChar '\'' = "\\'" + escChar '`' = "\\`" + escChar '"' = "\\\"" + escChar '@' = "\\@" + escChar '<' = "\\<" + escChar c = T.singleton c -- | Convert Pandoc block element to haddock. blockToHaddock :: PandocMonad m diff --git a/src/Text/Pandoc/Writers/ICML.hs b/src/Text/Pandoc/Writers/ICML.hs index dcf5acfef..c254fbc58 100644 --- a/src/Text/Pandoc/Writers/ICML.hs +++ b/src/Text/Pandoc/Writers/ICML.hs @@ -621,7 +621,12 @@ imageICML opts style attr (src, _) = do image = inTags True "Image" [("Self","ue6"), ("ItemTransform", scale<>" -"<>hw<>" -"<>hh)] $ vcat [ - inTags True "Properties" [] $ inTags True "Profile" [("type","string")] $ text "$ID/Embedded" + inTags True "Properties" [] $ vcat [ + inTags True "Profile" [("type","string")] $ text "$ID/Embedded" + , selfClosingTag "GraphicBounds" [("Left","0"), ("Top","0") + , ("Right", showFl $ ow*ow / imgWidth) + , ("Bottom", showFl $ oh*oh / imgHeight)] + ] , selfClosingTag "Link" [("Self", "ueb"), ("LinkResourceURI", src')] ] doc = inTags True "CharacterStyleRange" attrs diff --git a/src/Text/Pandoc/Writers/Ipynb.hs b/src/Text/Pandoc/Writers/Ipynb.hs index d01d5a7e5..2613851c5 100644 --- a/src/Text/Pandoc/Writers/Ipynb.hs +++ b/src/Text/Pandoc/Writers/Ipynb.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Writers.Ipynb - Copyright : Copyright (C) 2019-2020 John MacFarlane + Copyright : Copyright (C) 2019-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/JATS.hs b/src/Text/Pandoc/Writers/JATS.hs index 7058a4557..9db8723d1 100644 --- a/src/Text/Pandoc/Writers/JATS.hs +++ b/src/Text/Pandoc/Writers/JATS.hs @@ -1,9 +1,10 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.JATS - Copyright : Copyright (C) 2017-2020 John MacFarlane + Copyright : 2017-2021 John MacFarlane License : GNU GPL, version 2 or above 
Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -20,15 +21,17 @@ module Text.Pandoc.Writers.JATS , writeJatsPublishing , writeJatsArticleAuthoring ) where +import Control.Applicative ((<|>)) import Control.Monad.Reader import Control.Monad.State import Data.Generics (everywhere, mkT) import Data.List (partition) import qualified Data.Map as M -import Data.Maybe (fromMaybe) +import Data.Maybe (fromMaybe, listToMaybe) import Data.Time (toGregorian, Day, parseTimeM, defaultTimeLocale, formatTime) import qualified Data.Text as T import Data.Text (Text) +import Text.Pandoc.Citeproc (getReferences) import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import Text.Pandoc.Definition import Text.Pandoc.Highlighting (languages, languagesByExtension) @@ -40,6 +43,7 @@ import Text.DocLayout import Text.Pandoc.Shared import Text.Pandoc.Templates (renderTemplate) import Text.DocTemplates (Context(..), Val(..)) +import Text.Pandoc.Writers.JATS.References (referencesToJATS) import Text.Pandoc.Writers.JATS.Table (tableToJATS) import Text.Pandoc.Writers.JATS.Types import Text.Pandoc.Writers.Math @@ -71,15 +75,19 @@ writeJATS = writeJatsArchiving -- | Convert a @'Pandoc'@ document to JATS. writeJats :: PandocMonad m => JATSTagSet -> WriterOptions -> Pandoc -> m Text -writeJats tagSet opts d = - runReaderT (evalStateT (docToJATS opts d) initialState) - environment - where initialState = JATSState { jatsNotes = [] } - environment = JATSEnv +writeJats tagSet opts d = do + refs <- if extensionEnabled Ext_element_citations $ writerExtensions opts + then getReferences Nothing d + else pure [] + let environment = JATSEnv { jatsTagSet = tagSet , jatsInlinesWriter = inlinesToJATS - , jatsBlockWriter = blockToJATS + , jatsBlockWriter = wrappedBlocksToJATS + , jatsReferences = refs } + let initialState = JATSState { jatsNotes = [] } + runReaderT (evalStateT (docToJATS opts d) initialState) + environment -- | Convert Pandoc document to string in JATS format. docToJATS :: PandocMonad m => WriterOptions -> Pandoc -> JATS m Text @@ -168,13 +176,15 @@ plainToPara x = x -- | Convert a list of pairs of terms and definitions into a list of -- JATS varlistentrys. deflistItemsToJATS :: PandocMonad m - => WriterOptions -> [([Inline],[[Block]])] -> JATS m (Doc Text) + => WriterOptions + -> [([Inline],[[Block]])] -> JATS m (Doc Text) deflistItemsToJATS opts items = vcat <$> mapM (uncurry (deflistItemToJATS opts)) items -- | Convert a term and a list of blocks into a JATS varlistentry. deflistItemToJATS :: PandocMonad m - => WriterOptions -> [Inline] -> [[Block]] -> JATS m (Doc Text) + => WriterOptions + -> [Inline] -> [[Block]] -> JATS m (Doc Text) deflistItemToJATS opts term defs = do term' <- inlinesToJATS opts term def' <- wrappedBlocksToJATS (not . isPara) @@ -186,7 +196,8 @@ deflistItemToJATS opts term defs = do -- | Convert a list of lists of blocks to a list of JATS list items. listItemsToJATS :: PandocMonad m - => WriterOptions -> Maybe [Text] -> [[Block]] -> JATS m (Doc Text) + => WriterOptions + -> Maybe [Text] -> [[Block]] -> JATS m (Doc Text) listItemsToJATS opts markers items = case markers of Nothing -> vcat <$> mapM (listItemToJATS opts Nothing) items @@ -194,12 +205,13 @@ listItemsToJATS opts markers items = -- | Convert a list of blocks into a JATS list item. listItemToJATS :: PandocMonad m - => WriterOptions -> Maybe Text -> [Block] -> JATS m (Doc Text) + => WriterOptions + -> Maybe Text -> [Block] -> JATS m (Doc Text) listItemToJATS opts mbmarker item = do contents <- wrappedBlocksToJATS (not . 
isParaOrList) opts (walk demoteHeaderAndRefs item) return $ inTagsIndented "list-item" $ - maybe empty (\lbl -> inTagsSimple "label" (text $ T.unpack lbl)) mbmarker + maybe empty (inTagsSimple "label" . text . T.unpack) mbmarker $$ contents imageMimeType :: Text -> [(Text, Text)] -> (Text, Text) @@ -213,33 +225,36 @@ imageMimeType src kvs = (T.drop 1 . T.dropWhile (/='/') <$> mbMT) in (maintype, subtype) -languageFor :: [Text] -> Text -languageFor classes = +languageFor :: WriterOptions -> [Text] -> Text +languageFor opts classes = case langs of (l:_) -> escapeStringForXML l [] -> "" - where isLang l = T.toLower l `elem` map T.toLower languages + where + syntaxMap = writerSyntaxMap opts + isLang l = T.toLower l `elem` map T.toLower (languages syntaxMap) langsFrom s = if isLang s then [s] - else languagesByExtension . T.toLower $ s + else (languagesByExtension syntaxMap) . T.toLower $ s langs = concatMap langsFrom classes -codeAttr :: Attr -> (Text, [(Text, Text)]) -codeAttr (ident,classes,kvs) = (lang, attr) +codeAttr :: WriterOptions -> Attr -> (Text, [(Text, Text)]) +codeAttr opts (ident,classes,kvs) = (lang, attr) where - attr = [("id",ident) | not (T.null ident)] ++ + attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("language",lang) | not (T.null lang)] ++ [(k,v) | (k,v) <- kvs, k `elem` ["code-type", "code-version", "executable", "language-version", "orientation", "platforms", "position", "specific-use"]] - lang = languageFor classes + lang = languageFor opts classes -- | Convert a Pandoc block element to JATS. blockToJATS :: PandocMonad m => WriterOptions -> Block -> JATS m (Doc Text) blockToJATS _ Null = return empty blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do - let idAttr = [("id", writerIdentifierPrefix opts <> id') | not (T.null id')] + let idAttr = [ ("id", writerIdentifierPrefix opts <> escapeNCName id') + | not (T.null id')] let otherAttrs = ["sec-type", "specific-use"] let attribs = idAttr ++ [(k,v) | (k,v) <- kvs, k `elem` otherAttrs] title' <- inlinesToJATS opts ils @@ -247,21 +262,26 @@ blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do return $ inTags True "sec" attribs $ inTagsSimple "title" title' $$ contents -- Bibliography reference: -blockToJATS opts (Div (T.stripPrefix "ref-" -> Just _,_,_) [Para lst]) = +blockToJATS opts (Div (ident,_,_) [Para lst]) | "ref-" `T.isPrefixOf` ident = + inTags True "ref" [("id", escapeNCName ident)] . 
+ inTagsSimple "mixed-citation" <$> inlinesToJATS opts lst blockToJATS opts (Div ("refs",_,_) xs) = do - contents <- blocksToJATS opts xs + refs <- asks jatsReferences + contents <- if null refs + then blocksToJATS opts xs + else referencesToJATS opts refs return $ inTagsIndented "ref-list" contents blockToJATS opts (Div (ident,[cls],kvs) bs) | cls `elem` ["fig", "caption", "table-wrap"] = do contents <- blocksToJATS opts bs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("xml:lang",l) | ("lang",l) <- kvs] ++ [(k,v) | (k,v) <- kvs, k `elem` ["specific-use", "content-type", "orientation", "position"]] return $ inTags True cls attr contents blockToJATS opts (Div (ident,_,kvs) bs) = do contents <- blocksToJATS opts bs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("xml:lang",l) | ("lang",l) <- kvs] ++ [(k,v) | (k,v) <- kvs, k `elem` ["specific-use", "content-type", "orientation", "position"]] @@ -279,7 +299,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt let capt = if null txt then empty else inTagsSimple "caption" $ inTagsSimple "p" alt - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [(k,v) | (k,v) <- kvs, k `elem` ["fig-type", "orientation", "position", "specific-use"]] let graphicattr = [("mimetype",maintype), @@ -290,7 +310,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt capt $$ selfClosingTag "graphic" graphicattr blockToJATS _ (Para [Image (ident,_,kvs) _ (src, tit)]) = do let (maintype, subtype) = imageMimeType src kvs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("mimetype", maintype), ("mime-subtype", subtype), ("xlink:href", src)] ++ @@ -306,13 +326,16 @@ blockToJATS opts (LineBlock lns) = blockToJATS opts $ linesToPara lns blockToJATS opts (BlockQuote blocks) = do tagSet <- asks jatsTagSet - let blocksToJats' = if tagSet == TagSetArticleAuthoring - then wrappedBlocksToJATS (not . isPara) - else blocksToJATS - inTagsIndented "disp-quote" <$> blocksToJats' opts blocks -blockToJATS _ (CodeBlock a str) = return $ + let needsWrap = if tagSet == TagSetArticleAuthoring + then not . isPara + else \case + Header{} -> True + HorizontalRule -> True + _ -> False + inTagsIndented "disp-quote" <$> wrappedBlocksToJATS needsWrap opts blocks +blockToJATS opts (CodeBlock a str) = return $ inTags False tag attr (flush (text (T.unpack $ escapeStringForXML str))) - where (lang, attr) = codeAttr a + where (lang, attr) = codeAttr opts a tag = if T.null lang then "preformat" else "code" blockToJATS _ (BulletList []) = return empty blockToJATS opts (BulletList lst) = @@ -392,9 +415,9 @@ inlineToJATS opts (Quoted SingleQuote lst) = do inlineToJATS opts (Quoted DoubleQuote lst) = do contents <- inlinesToJATS opts lst return $ char '“' <> contents <> char '”' -inlineToJATS _ (Code a str) = +inlineToJATS opts (Code a str) = return $ inTags False tag attr $ literal (escapeStringForXML str) - where (lang, attr) = codeAttr a + where (lang, attr) = codeAttr opts a tag = if T.null lang then "monospace" else "code" inlineToJATS _ il@(RawInline f x) | f == "jats" = return $ literal x @@ -417,7 +440,8 @@ inlineToJATS opts (Note contents) = do let notenum = case notes of (n, _):_ -> n + 1 [] -> 1 - thenote <- inTags True "fn" [("id","fn" <> tshow notenum)] + thenote <- inTags True "fn" [("id", "fn" <> tshow notenum)] + . 
(inTagsSimple "label" (literal $ tshow notenum) <>) <$> wrappedBlocksToJATS (not . isPara) opts (walk demoteHeaderAndRefs contents) modify $ \st -> st{ jatsNotes = (notenum, thenote) : notes } @@ -425,18 +449,34 @@ inlineToJATS opts (Note contents) = do ("rid", "fn" <> tshow notenum)] $ text (show notenum) inlineToJATS opts (Cite _ lst) = - -- TODO revisit this after examining the jats.csl pipeline inlinesToJATS opts lst -inlineToJATS opts (Span ("",_,[]) ils) = inlinesToJATS opts ils -inlineToJATS opts (Span (ident,_,kvs) ils) = do +inlineToJATS opts (Span (ident,classes,kvs) ils) = do contents <- inlinesToJATS opts ils - let attr = [("id",ident) | not (T.null ident)] ++ - [("xml:lang",l) | ("lang",l) <- kvs] ++ - [(k,v) | (k,v) <- kvs - , k `elem` ["content-type", "rationale", - "rid", "specific-use"]] - return $ selfClosingTag "milestone-start" attr <> contents <> - selfClosingTag "milestone-end" [] + let commonAttr = [("id", escapeNCName ident) | not (T.null ident)] ++ + [("xml:lang",l) | ("lang",l) <- kvs] ++ + [(k,v) | (k,v) <- kvs, k `elem` ["alt", "specific-use"]] + -- A named-content element is a good fit for spans, but requires a + -- content-type attribute to be present. We use either the explicit + -- attribute or the first class as content type. If neither is + -- available, then we fall back to using a @styled-content@ element. + let (tag, specificAttr) = + case lookup "content-type" kvs <|> listToMaybe classes of + Just ct -> ( "named-content" + , ("content-type", ct) : + [(k, v) | (k, v) <- kvs + , k `elem` ["rid", "vocab", "vocab-identifier", + "vocab-term", "vocab-term-identifier"]]) + -- Fall back to styled-content + Nothing -> ("styled-content" + , [(k, v) | (k,v) <- kvs + , k `elem` ["style", "style-type", "style-detail", + "toggle"]]) + let attr = commonAttr ++ specificAttr + -- unwrap if wrapping element would have no attributes + return $ + if null attr + then contents + else inTags False tag attr contents inlineToJATS _ (Math t str) = do let addPref (Xml.Attr q v) | Xml.qName q == "xmlns" = Xml.Attr q{ Xml.qName = "xmlns:mml" } v @@ -470,17 +510,20 @@ inlineToJATS _ (Link _attr [Str t] (T.stripPrefix "mailto:" -> Just email, _)) | escapeURI t == email = return $ inTagsSimple "email" $ literal (escapeStringForXML email) inlineToJATS opts (Link (ident,_,kvs) txt (T.uncons -> Just ('#', src), _)) = do - let attr = [("id", ident) | not (T.null ident)] ++ - [("alt", stringify txt) | not (null txt)] ++ - [("rid", src)] ++ - [(k,v) | (k,v) <- kvs, k `elem` ["ref-type", "specific-use"]] + let attr = mconcat + [ [("id", escapeNCName ident) | not (T.null ident)] + , [("alt", stringify txt) | not (null txt)] + , [("rid", escapeNCName src)] + , [(k,v) | (k,v) <- kvs, k `elem` ["ref-type", "specific-use"]] + , [("ref-type", "bibr") | "ref-" `T.isPrefixOf` src] + ] if null txt then return $ selfClosingTag "xref" attr else do contents <- inlinesToJATS opts txt return $ inTags False "xref" attr contents inlineToJATS opts (Link (ident,_,kvs) txt (src, tit)) = do - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("ext-link-type", "uri"), ("xlink:href", src)] ++ [("xlink:title", tit) | not (T.null tit)] ++ @@ -498,7 +541,7 @@ inlineToJATS _ (Image (ident,_,kvs) _ (src, tit)) = do let subtype = fromMaybe "" $ lookup "mime-subtype" kvs `mplus` (T.drop 1 . 
T.dropWhile (/='/') <$> mbMT) - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("mimetype", maintype), ("mime-subtype", subtype), ("xlink:href", src)] ++ @@ -529,7 +572,7 @@ demoteHeaderAndRefs (Div ("refs",cls,kvs) bs) = demoteHeaderAndRefs x = x parseDate :: Text -> Maybe Day -parseDate s = msum (map (\fs -> parsetimeWith fs $ T.unpack s) formats) :: Maybe Day +parseDate s = msum (map (`parsetimeWith` T.unpack s) formats) where parsetimeWith = parseTimeM True defaultTimeLocale formats = ["%x","%m/%d/%Y", "%D","%F", "%d %b %Y", "%e %B %Y", "%b. %e, %Y", "%B %e, %Y", diff --git a/src/Text/Pandoc/Writers/JATS/References.hs b/src/Text/Pandoc/Writers/JATS/References.hs new file mode 100644 index 000000000..5b19fd034 --- /dev/null +++ b/src/Text/Pandoc/Writers/JATS/References.hs @@ -0,0 +1,164 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.JATS.References + Copyright : © 2021 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel <tarleb@zeitkraut.de> + Stability : alpha + Portability : portable + +Creation of a bibliography list using @<element-citation>@ elements in +reference items. +-} +module Text.Pandoc.Writers.JATS.References + ( referencesToJATS + , referenceToJATS + ) where + +import Citeproc.Pandoc () +import Citeproc.Types + ( Date (..), DateParts (..), ItemId (..), Name (..), Reference (..) + , Val (..) , lookupVariable, valToText + ) +import Data.Text (Text) +import Text.DocLayout (Doc, empty, isEmpty, literal, vcat) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Builder (Inlines) +import Text.Pandoc.Options (WriterOptions) +import Text.Pandoc.Shared (tshow) +import Text.Pandoc.Writers.JATS.Types +import Text.Pandoc.XML (escapeNCName, escapeStringForXML, inTags) +import qualified Data.Text as T + +referencesToJATS :: PandocMonad m + => WriterOptions + -> [Reference Inlines] + -> JATS m (Doc Text) +referencesToJATS opts = + fmap (inTags True "ref-list" [] . vcat) . mapM (referenceToJATS opts) + +referenceToJATS :: PandocMonad m + => WriterOptions + -> Reference Inlines + -> JATS m (Doc Text) +referenceToJATS _opts ref = do + let refType = referenceType ref + let pubType = [("publication-type", refType) | not (T.null refType)] + let ident = escapeNCName $ "ref-" <> unItemId (referenceId ref) + let wrap = inTags True "ref" [("id", ident)] + . inTags True "element-citation" pubType + return . wrap . vcat $ + [ authors + , "title" `varInTag` + if refType == "book" + then "source" + else "article-title" + , if refType == "book" + then empty + else "container-title" `varInTag` "source" + , editors + , "publisher" `varInTag` "publisher-name" + , "publisher-place" `varInTag` "publisher-loc" + , yearTag + , accessed + , "volume" `varInTag` "volume" + , "issue" `varInTag` "issue" + , "page-first" `varInTag` "fpage" + , "page-last" `varInTag` "lpage" + , "pages" `varInTag` "page-range" + , "ISBN" `varInTag` "isbn" + , "ISSN" `varInTag` "issn" + , varInTagWith "doi" "pub-id" [("pub-id-type", "doi")] + , varInTagWith "pmid" "pub-id" [("pub-id-type", "pmid")] + ] + where + varInTag var tagName = varInTagWith var tagName [] + + varInTagWith var tagName tagAttribs = + case lookupVariable var ref >>= valToText of + Nothing -> mempty + Just val -> inTags' tagName tagAttribs . 
literal $ + escapeStringForXML val + + authors = case lookupVariable "author" ref of + Just (NamesVal names) -> + inTags True "person-group" [("person-group-type", "author")] . vcat $ + map toNameElements names + _ -> empty + + editors = case lookupVariable "editor" ref of + Just (NamesVal names) -> + inTags True "person-group" [("person-group-type", "editor")] . vcat $ + map toNameElements names + _ -> empty + + yearTag = + case lookupVariable "issued" ref of + Just (DateVal date) -> toDateElements date + _ -> empty + + accessed = + case lookupVariable "accessed" ref of + Just (DateVal d) -> inTags' "date-in-citation" + [("content-type", "access-date")] + (toDateElements d) + _ -> empty + +toDateElements :: Date -> Doc Text +toDateElements date = + case dateParts date of + dp@(DateParts (y:m:d:_)):_ -> yearElement y dp <> + monthElement m <> + dayElement d + dp@(DateParts (y:m:_)):_ -> yearElement y dp <> monthElement m + dp@(DateParts (y:_)):_ -> yearElement y dp + _ -> empty + +yearElement :: Int -> DateParts -> Doc Text +yearElement year dp = + inTags' "year" [("iso-8601-date", iso8601 dp)] $ literal (fourDigits year) + +monthElement :: Int -> Doc Text +monthElement month = inTags' "month" [] . literal $ twoDigits month + +dayElement :: Int -> Doc Text +dayElement day = inTags' "day" [] . literal $ twoDigits day + +iso8601 :: DateParts -> Text +iso8601 = T.intercalate "-" . \case + DateParts (y:m:d:_) -> [fourDigits y, twoDigits m, twoDigits d] + DateParts (y:m:_) -> [fourDigits y, twoDigits m] + DateParts (y:_) -> [fourDigits y] + _ -> [] + +twoDigits :: Int -> Text +twoDigits n = T.takeEnd 2 $ '0' `T.cons` tshow n + +fourDigits :: Int -> Text +fourDigits n = T.takeEnd 4 $ "000" <> tshow n + +toNameElements :: Name -> Doc Text +toNameElements name = + if not (isEmpty nameTags) + then inTags' "name" [] nameTags + else nameLiteral name `inNameTag` "string-name" + where + inNameTag mVal tag = case mVal of + Nothing -> empty + Just val -> inTags' tag [] . literal $ escapeStringForXML val + surnamePrefix = maybe mempty (`T.snoc` ' ') $ + nameNonDroppingParticle name + givenSuffix = maybe mempty (T.cons ' ') $ + nameDroppingParticle name + nameTags = mconcat + [ ((surnamePrefix <>) <$> nameFamily name) `inNameTag` "surname" + , ((<> givenSuffix) <$> nameGiven name) `inNameTag` "given-names" + , nameSuffix name `inNameTag` "suffix" + ] + +-- | Put the supplied contents between start and end tags of tagType, +-- with specified attributes. 
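-- (Illustrative aside, not part of the patch: using Text.DocLayout's
-- 'render' (not imported in this module) together with the 'literal'
-- helper used above, a call such as
--
-- > render Nothing (inTags' "year" [("iso-8601-date", "2021")] (literal "2021"))
--
-- should come out roughly as <year iso-8601-date="2021">2021</year>.)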
+inTags' :: Text -> [(Text, Text)] -> Doc Text -> Doc Text +inTags' = inTags False diff --git a/src/Text/Pandoc/Writers/JATS/Table.hs b/src/Text/Pandoc/Writers/JATS/Table.hs index a4d42832d..70569bdcd 100644 --- a/src/Text/Pandoc/Writers/JATS/Table.hs +++ b/src/Text/Pandoc/Writers/JATS/Table.hs @@ -3,7 +3,7 @@ {-# LANGUAGE TupleSections #-} {- | Module : Text.Pandoc.Writers.JATS.Table - Copyright : © 2020 Albert Krewinkel + Copyright : © 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb@zeitkraut.de> @@ -24,7 +24,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions) import Text.Pandoc.Shared (tshow) import Text.Pandoc.Writers.JATS.Types -import Text.Pandoc.XML (inTags, inTagsIndented, selfClosingTag) +import Text.Pandoc.XML (escapeNCName, inTags, inTagsIndented, selfClosingTag) import qualified Data.Text as T import qualified Text.Pandoc.Writers.AnnotatedTable as Ann @@ -34,13 +34,19 @@ tableToJATS :: PandocMonad m -> JATS m (Doc Text) tableToJATS opts (Ann.Table attr caption colspecs thead tbodies tfoot) = do let (Caption _maybeShortCaption captionBlocks) = caption + -- Only paragraphs are allowed in captions, all other blocks must be + -- wrapped in @<p>@ elements. + let needsWrapping = \case + Plain{} -> False + Para{} -> False + _ -> True tbl <- captionlessTable opts attr colspecs thead tbodies tfoot captionDoc <- if null captionBlocks then return empty else do blockToJATS <- asks jatsBlockWriter - inTagsIndented "caption" . vcat <$> - mapM (blockToJATS opts) captionBlocks + inTagsIndented "caption" <$> + blockToJATS needsWrapping opts captionBlocks return $ inTags True "table-wrap" [] $ captionDoc $$ tbl captionlessTable :: PandocMonad m @@ -216,7 +222,7 @@ cellToJats opts celltype (Ann.Cell (colspec :| _) _colNum cell) = toAttribs :: Attr -> [Text] -> [(Text, Text)] toAttribs (ident, _classes, kvs) knownAttribs = - (if T.null ident then id else (("id", ident) :)) $ + (if T.null ident then id else (("id", escapeNCName ident) :)) $ filter ((`elem` knownAttribs) . fst) kvs tableCellToJats :: PandocMonad m @@ -230,7 +236,7 @@ tableCellToJats opts ctype colAlign (Cell attr align rowspan colspan item) = do inlinesToJats <- asks jatsInlinesWriter let cellContents = \case [Plain inlines] -> inlinesToJats opts inlines - blocks -> vcat <$> mapM (blockToJats opts) blocks + blocks -> blockToJats needsWrapInCell opts blocks let tag' = case ctype of BodyCell -> "td" HeaderCell -> "th" @@ -246,3 +252,17 @@ tableCellToJats opts ctype colAlign (Cell attr align rowspan colspan item) = do . maybeCons (colspanAttrib colspan) $ toAttribs attr validAttribs inTags False tag' attribs <$> cellContents item + +-- | Whether the JATS produced from this block should be wrapped in a +-- @<p>@ element when put directly below a @<td>@ element. 
+needsWrapInCell :: Block -> Bool +needsWrapInCell = \case + Plain{} -> False -- should be unwrapped anyway + Para{} -> False + BulletList{} -> False + OrderedList{} -> False + DefinitionList{} -> False + HorizontalRule -> False + CodeBlock{} -> False + RawBlock{} -> False -- responsibility of the user + _ -> True diff --git a/src/Text/Pandoc/Writers/JATS/Types.hs b/src/Text/Pandoc/Writers/JATS/Types.hs index 8162f3bc0..8d8673cf6 100644 --- a/src/Text/Pandoc/Writers/JATS/Types.hs +++ b/src/Text/Pandoc/Writers/JATS/Types.hs @@ -1,6 +1,6 @@ {- | Module : Text.Pandoc.Writers.JATS.Types - Copyright : Copyright (C) 2017-2020 John MacFarlane + Copyright : Copyright (C) 2017-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -17,11 +17,12 @@ module Text.Pandoc.Writers.JATS.Types ) where +import Citeproc.Types (Reference) import Control.Monad.Reader (ReaderT) import Control.Monad.State (StateT) import Data.Text (Text) import Text.DocLayout (Doc) -import Text.Pandoc.Definition (Block, Inline) +import Text.Pandoc.Builder (Block, Inline, Inlines) import Text.Pandoc.Options (WriterOptions) -- | JATS tag set variant @@ -36,10 +37,20 @@ newtype JATSState = JATSState { jatsNotes :: [(Int, Doc Text)] } +-- | Environment containing all information relevant for rendering. data JATSEnv m = JATSEnv - { jatsTagSet :: JATSTagSet + { jatsTagSet :: JATSTagSet -- ^ The tag set that's being ouput + + , jatsBlockWriter :: (Block -> Bool) + -> WriterOptions -> [Block] -> JATS m (Doc Text) + -- ^ Converts a block list to JATS, wrapping top-level blocks into a + -- @<p>@ element if the property evaluates to @True@. + -- See #7227. + , jatsInlinesWriter :: WriterOptions -> [Inline] -> JATS m (Doc Text) - , jatsBlockWriter :: WriterOptions -> Block -> JATS m (Doc Text) + -- ^ Converts an inline list to JATS. + + , jatsReferences :: [Reference Inlines] -- ^ List of references } -- | JATS writer type diff --git a/src/Text/Pandoc/Writers/Jira.hs b/src/Text/Pandoc/Writers/Jira.hs index 6bc048a61..1351814e9 100644 --- a/src/Text/Pandoc/Writers/Jira.hs +++ b/src/Text/Pandoc/Writers/Jira.hs @@ -3,7 +3,7 @@ {-# LANGUAGE PatternGuards #-} {- | Module : Text.Pandoc.Writers.Jira - Copyright : © 2010-2020 Albert Krewinkel, John MacFarlane + Copyright : © 2010-2021 Albert Krewinkel, John MacFarlane License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -39,11 +39,17 @@ writeJira :: PandocMonad m => WriterOptions -> Pandoc -> m Text writeJira opts = runDefaultConverter (writerWrapText opts) (pandocToJira opts) -- | State to keep track of footnotes. -newtype ConverterState = ConverterState { stNotes :: [Text] } +data ConverterState = ConverterState + { stNotes :: [Text] -- ^ Footnotes to be appended to the end of the text + , stInPanel :: Bool -- ^ whether we are in a @{panel}@ block + } -- | Initial converter state. startState :: ConverterState -startState = ConverterState { stNotes = [] } +startState = ConverterState + { stNotes = [] + , stInPanel = False + } -- | Converter monad type JiraConverter m = ReaderT WrapOption (StateT ConverterState m) @@ -126,13 +132,19 @@ toJiraCode :: PandocMonad m -> Text -> JiraConverter m [Jira.Block] toJiraCode (ident, classes, _attribs) code = do - let lang = case find (\c -> T.toLower c `elem` knownLanguages) classes of - Nothing -> Jira.Language "java" - Just l -> Jira.Language l - let addAnchor b = if T.null ident - then b - else [Jira.Para (singleton (Jira.Anchor ident))] <> b - return . 
addAnchor . singleton $ Jira.Code lang mempty code + return . addAnchor ident . singleton $ + case find (\c -> T.toLower c `elem` knownLanguages) classes of + Nothing -> Jira.NoFormat mempty code + Just l -> Jira.Code (Jira.Language l) mempty code + +-- | Prepends an anchor with the given identifier. +addAnchor :: Text -> [Jira.Block] -> [Jira.Block] +addAnchor ident = + if T.null ident + then id + else \case + Jira.Para xs : bs -> (Jira.Para (Jira.Anchor ident : xs) : bs) + bs -> (Jira.Para (singleton (Jira.Anchor ident)) : bs) -- | Creates a Jira definition list toJiraDefinitionList :: PandocMonad m @@ -149,11 +161,16 @@ toJiraDefinitionList defItems = do toJiraPanel :: PandocMonad m => Attr -> [Block] -> JiraConverter m [Jira.Block] -toJiraPanel attr blocks = do - jiraBlocks <- toJiraBlocks blocks - return $ if attr == nullAttr - then jiraBlocks - else singleton (Jira.Panel [] jiraBlocks) +toJiraPanel (ident, classes, attribs) blocks = do + inPanel <- gets stInPanel + if inPanel || ("panel" `notElem` classes && null attribs) + then addAnchor ident <$> toJiraBlocks blocks + else do + modify $ \st -> st{ stInPanel = True } + jiraBlocks <- toJiraBlocks blocks + modify $ \st -> st{ stInPanel = inPanel } + let params = map (uncurry Jira.Parameter) attribs + return $ singleton (Jira.Panel params $ addAnchor ident jiraBlocks) -- | Creates a Jira header toJiraHeader :: PandocMonad m @@ -263,6 +280,8 @@ toJiraLink (_, classes, _) (url, _) alias = do | Just email <- T.stripPrefix "mailto:" url' = (Jira.Email, email) | "user-account" `elem` classes = (Jira.User, dropTilde url) | "attachment" `elem` classes = (Jira.Attachment, url) + | "smart-card" `elem` classes = (Jira.SmartCard, url) + | "smart-link" `elem` classes = (Jira.SmartLink, url) | otherwise = (Jira.External, url) dropTilde txt = case T.uncons txt of Just ('~', username) -> username @@ -292,7 +311,13 @@ quotedToJira qtype xs = do spanToJira :: PandocMonad m => Attr -> [Inline] -> JiraConverter m [Jira.Inline] -spanToJira (_, _classes, _) = toJiraInlines +spanToJira (ident, _classes, attribs) inls = + let wrap = case lookup "color" attribs of + Nothing -> id + Just color -> singleton . 
Jira.ColorInline (Jira.ColorName color) + in wrap <$> case ident of + "" -> toJiraInlines inls + _ -> (Jira.Anchor ident :) <$> toJiraInlines inls registerNotes :: PandocMonad m => [Block] -> JiraConverter m [Jira.Inline] registerNotes contents = do @@ -308,7 +333,7 @@ registerNotes contents = do knownLanguages :: [Text] knownLanguages = [ "actionscript", "ada", "applescript", "bash", "c", "c#", "c++" - , "css", "erlang", "go", "groovy", "haskell", "html", "javascript" + , "css", "erlang", "go", "groovy", "haskell", "html", "java", "javascript" , "json", "lua", "nyan", "objc", "perl", "php", "python", "r", "ruby" , "scala", "sql", "swift", "visualbasic", "xml", "yaml" ] diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index d665269ab..063e347fb 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -1,11 +1,12 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.LaTeX - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -18,12 +19,9 @@ module Text.Pandoc.Writers.LaTeX ( writeLaTeX , writeBeamer ) where -import Control.Applicative ((<|>)) import Control.Monad.State.Strict -import Data.Monoid (Any(..)) -import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isSpace, - isPunctuation, ord) -import Data.List (foldl', intersperse, nubBy, (\\), uncons) +import Data.Char (isDigit) +import Data.List (intersperse, nubBy, (\\)) import Data.Maybe (catMaybes, fromMaybe, isJust, mapMaybe, isNothing) import qualified Data.Map as M import Data.Text (Text) @@ -31,79 +29,30 @@ import qualified Data.Text as T import Network.URI (unEscapeString) import Text.DocTemplates (FromContext(lookupContext), renderTemplate, Val(..), Context(..)) -import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang) +import Text.Collate.Lang (Lang (..), renderLang) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) import Text.Pandoc.Definition import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight, - styleToLaTeX, toListingsLanguage) + styleToLaTeX) import Text.Pandoc.ImageSize import Text.Pandoc.Logging import Text.Pandoc.Options import Text.DocLayout import Text.Pandoc.Shared import Text.Pandoc.Slides -import Text.Pandoc.Walk +import Text.Pandoc.Walk (query, walk, walkM) +import Text.Pandoc.Writers.LaTeX.Caption (getCaption) +import Text.Pandoc.Writers.LaTeX.Table (tableToLaTeX) +import Text.Pandoc.Writers.LaTeX.Citation (citationsToNatbib, + citationsToBiblatex) +import Text.Pandoc.Writers.LaTeX.Types (LW, WriterState (..), startingState) +import Text.Pandoc.Writers.LaTeX.Lang (toPolyglossia, toBabel) +import Text.Pandoc.Writers.LaTeX.Util (stringToLaTeX, StringContext(..), + toLabel, inCmd, + wrapDiv, hypertarget, labelFor, + getListingsLanguage, mbBraced) import Text.Pandoc.Writers.Shared -import Text.Printf (printf) -import qualified Data.Text.Normalize as Normalize - -data WriterState = - WriterState { stInNote :: Bool -- true if we're in a note - , stInQuote :: Bool -- true if in a blockquote - , stExternalNotes :: Bool -- true if in context where - -- we need to store footnotes - , stInMinipage :: Bool -- true if in minipage - , stInHeading :: Bool -- true if in a section heading - , stInItem :: Bool 
-- true if in \item[..] - , stNotes :: [Doc Text] -- notes in a minipage - , stOLLevel :: Int -- level of ordered list nesting - , stOptions :: WriterOptions -- writer options, so they don't have to be parameter - , stVerbInNote :: Bool -- true if document has verbatim text in note - , stTable :: Bool -- true if document has a table - , stStrikeout :: Bool -- true if document has strikeout - , stUrl :: Bool -- true if document has visible URL link - , stGraphics :: Bool -- true if document contains images - , stLHS :: Bool -- true if document has literate haskell code - , stHasChapters :: Bool -- true if document has chapters - , stCsquotes :: Bool -- true if document uses csquotes - , stHighlighting :: Bool -- true if document has highlighted code - , stIncremental :: Bool -- true if beamer lists should be displayed bit by bit - , stInternalLinks :: [Text] -- list of internal link targets - , stBeamer :: Bool -- produce beamer - , stEmptyLine :: Bool -- true if no content on line - , stHasCslRefs :: Bool -- has a Div with class refs - , stIsFirstInDefinition :: Bool -- first block in a defn list - } - -startingState :: WriterOptions -> WriterState -startingState options = WriterState { - stInNote = False - , stInQuote = False - , stExternalNotes = False - , stInHeading = False - , stInMinipage = False - , stInItem = False - , stNotes = [] - , stOLLevel = 1 - , stOptions = options - , stVerbInNote = False - , stTable = False - , stStrikeout = False - , stUrl = False - , stGraphics = False - , stLHS = False - , stHasChapters = case writerTopLevelDivision options of - TopLevelPart -> True - TopLevelChapter -> True - _ -> False - , stCsquotes = False - , stHighlighting = False - , stIncremental = writerIncremental options - , stInternalLinks = [] - , stBeamer = False - , stEmptyLine = True - , stHasCslRefs = False - , stIsFirstInDefinition = False } +import qualified Text.Pandoc.Writers.AnnotatedTable as Ann -- | Convert Pandoc to LaTeX. 
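-- A minimal usage sketch (editorial, not from the patch; it assumes the
-- usual 'runPure', the 'def' WriterOptions, and the Text.Pandoc.Definition
-- constructors):
--
-- > runPure $ writeLaTeX def (Pandoc nullMeta [Para [Str "Hello"]])
--
-- should evaluate to something like Right "Hello".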
writeLaTeX :: PandocMonad m => WriterOptions -> Pandoc -> m Text @@ -117,8 +66,6 @@ writeBeamer options document = evalStateT (pandocToLaTeX options document) $ (startingState options){ stBeamer = True } -type LW m = StateT WriterState m - pandocToLaTeX :: PandocMonad m => WriterOptions -> Pandoc -> LW m Text pandocToLaTeX options (Pandoc meta blocks) = do @@ -199,6 +146,11 @@ pandocToLaTeX options (Pandoc meta blocks) = do let dirs = query (extract "dir") blocks + let nociteIds = query (\case + Cite cs _ -> map citationId cs + _ -> []) + $ lookupMetaInlines "nocite" meta + let context = defField "toc" (writerTableOfContents options) $ defField "toc-depth" (tshow (writerTOCDepth options - @@ -212,6 +164,7 @@ pandocToLaTeX options (Pandoc meta blocks) = do defField "documentclass" documentClass $ defField "verbatim-in-note" (stVerbInNote st) $ defField "tables" (stTable st) $ + defField "multirow" (stMultiRow st) $ defField "strikeout" (stStrikeout st) $ defField "url" (stUrl st) $ defField "numbersections" (writerNumberSections options) $ @@ -220,6 +173,7 @@ pandocToLaTeX options (Pandoc meta blocks) = do defField "has-chapters" (stHasChapters st) $ defField "has-frontmatter" (documentClass `elem` frontmatterClasses) $ defField "listings" (writerListings options || stLHS st) $ + defField "zero-width-non-joiner" (stZwnj st) $ defField "beamer" beamer $ (if stHighlighting st then case writerHighlightStyle options of @@ -230,9 +184,11 @@ pandocToLaTeX options (Pandoc meta blocks) = do else id) $ (case writerCiteMethod options of Natbib -> defField "biblio-title" biblioTitle . - defField "natbib" True + defField "natbib" True . + defField "nocite-ids" nociteIds Biblatex -> defField "biblio-title" biblioTitle . - defField "biblatex" True + defField "biblatex" True . + defField "nocite-ids" nociteIds _ -> id) $ defField "colorlinks" (any hasStringValue ["citecolor", "urlcolor", "linkcolor", "toccolor", @@ -297,154 +253,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do Nothing -> main Just tpl -> renderTemplate tpl context' -data StringContext = TextString - | URLString - | CodeString - deriving (Eq) - --- escape things as needed for LaTeX -stringToLaTeX :: PandocMonad m => StringContext -> Text -> LW m Text -stringToLaTeX context zs = do - opts <- gets stOptions - return $ T.pack $ - foldr (go opts context) mempty $ T.unpack $ - if writerPreferAscii opts - then Normalize.normalize Normalize.NFD zs - else zs - where - go :: WriterOptions -> StringContext -> Char -> String -> String - go opts ctx x xs = - let ligatures = isEnabled Ext_smart opts && ctx == TextString - isUrl = ctx == URLString - mbAccentCmd = - if writerPreferAscii opts && ctx == TextString - then uncons xs >>= \(c,_) -> M.lookup c accents - else Nothing - emits s = - case mbAccentCmd of - Just cmd -> - cmd <> "{" <> s <> "}" <> drop 1 xs -- drop combining accent - Nothing -> s <> xs - emitc c = - case mbAccentCmd of - Just cmd -> - cmd <> "{" <> [c] <> "}" <> drop 1 xs -- drop combining accent - Nothing -> c : xs - emitcseq cs = - case xs of - c:_ | isLetter c - , ctx == TextString - -> cs <> " " <> xs - | isSpace c -> cs <> "{}" <> xs - | ctx == TextString - -> cs <> xs - _ -> cs <> "{}" <> xs - emitquote cs = - case xs of - '`':_ -> cs <> "\\," <> xs -- add thin space - '\'':_ -> cs <> "\\," <> xs -- add thin space - _ -> cs <> xs - in case x of - '?' | ligatures -> -- avoid ?` ligature - case xs of - '`':_ -> emits "?{}" - _ -> emitc x - '!' 
| ligatures -> -- avoid !` ligature - case xs of - '`':_ -> emits "!{}" - _ -> emitc x - '{' -> emits "\\{" - '}' -> emits "\\}" - '`' | ctx == CodeString -> emitcseq "\\textasciigrave" - '$' | not isUrl -> emits "\\$" - '%' -> emits "\\%" - '&' -> emits "\\&" - '_' | not isUrl -> emits "\\_" - '#' -> emits "\\#" - '-' | not isUrl -> case xs of - -- prevent adjacent hyphens from forming ligatures - ('-':_) -> emits "-\\/" - _ -> emitc '-' - '~' | not isUrl -> emitcseq "\\textasciitilde" - '^' -> emits "\\^{}" - '\\'| isUrl -> emitc '/' -- NB. / works as path sep even on Windows - | otherwise -> emitcseq "\\textbackslash" - '|' | not isUrl -> emitcseq "\\textbar" - '<' -> emitcseq "\\textless" - '>' -> emitcseq "\\textgreater" - '[' -> emits "{[}" -- to avoid interpretation as - ']' -> emits "{]}" -- optional arguments - '\'' | ctx == CodeString -> emitcseq "\\textquotesingle" - '\160' -> emits "~" - '\x200B' -> emits "\\hspace{0pt}" -- zero-width space - '\x202F' -> emits "\\," - '\x2026' -> emitcseq "\\ldots" - '\x2018' | ligatures -> emitquote "`" - '\x2019' | ligatures -> emitquote "'" - '\x201C' | ligatures -> emitquote "``" - '\x201D' | ligatures -> emitquote "''" - '\x2014' | ligatures -> emits "---" - '\x2013' | ligatures -> emits "--" - _ | writerPreferAscii opts - -> case x of - 'ı' -> emitcseq "\\i" - 'ȷ' -> emitcseq "\\j" - 'å' -> emitcseq "\\aa" - 'Å' -> emitcseq "\\AA" - 'ß' -> emitcseq "\\ss" - 'ø' -> emitcseq "\\o" - 'Ø' -> emitcseq "\\O" - 'Ł' -> emitcseq "\\L" - 'ł' -> emitcseq "\\l" - 'æ' -> emitcseq "\\ae" - 'Æ' -> emitcseq "\\AE" - 'œ' -> emitcseq "\\oe" - 'Œ' -> emitcseq "\\OE" - '£' -> emitcseq "\\pounds" - '€' -> emitcseq "\\euro" - '©' -> emitcseq "\\copyright" - _ -> emitc x - | otherwise -> emitc x - -accents :: M.Map Char String -accents = M.fromList - [ ('\779' , "\\H") - , ('\768' , "\\`") - , ('\769' , "\\'") - , ('\770' , "\\^") - , ('\771' , "\\~") - , ('\776' , "\\\"") - , ('\775' , "\\.") - , ('\772' , "\\=") - , ('\781' , "\\|") - , ('\817' , "\\b") - , ('\807' , "\\c") - , ('\783' , "\\G") - , ('\777' , "\\h") - , ('\803' , "\\d") - , ('\785' , "\\f") - , ('\778' , "\\r") - , ('\865' , "\\t") - , ('\782' , "\\U") - , ('\780' , "\\v") - , ('\774' , "\\u") - , ('\808' , "\\k") - , ('\785' , "\\newtie") - , ('\8413', "\\textcircled") - ] - -toLabel :: PandocMonad m => Text -> LW m Text -toLabel z = go `fmap` stringToLaTeX URLString z - where - go = T.concatMap $ \x -> case x of - _ | (isLetter x || isDigit x) && isAscii x -> T.singleton x - | x `elemText` "_-+=:;." -> T.singleton x - | otherwise -> T.pack $ "ux" <> printf "%x" (ord x) - --- | Puts contents into LaTeX command. 
-inCmd :: Text -> Doc Text -> Doc Text -inCmd cmd contents = char '\\' <> literal cmd <> braces contents - toSlides :: PandocMonad m => [Block] -> LW m [Block] toSlides bs = do opts <- gets stOptions @@ -483,7 +291,12 @@ blockToLaTeX :: PandocMonad m => Block -- ^ Block to convert -> LW m (Doc Text) blockToLaTeX Null = return empty -blockToLaTeX (Div attr@(identifier,"block":_,_) (Header _ _ ils : bs)) = do +blockToLaTeX (Div attr@(identifier,"block":dclasses,_) + (Header _ _ ils : bs)) = do + let blockname + | "example" `elem` dclasses = "exampleblock" + | "alert" `elem` dclasses = "alertblock" + | otherwise = "block" ref <- toLabel identifier let anchor = if T.null identifier then empty @@ -491,8 +304,8 @@ blockToLaTeX (Div attr@(identifier,"block":_,_) (Header _ _ ils : bs)) = do braces (literal ref) <> braces empty title' <- inlineListToLaTeX ils contents <- blockListToLaTeX bs - wrapDiv attr $ ("\\begin{block}" <> braces title' <> anchor) $$ - contents $$ "\\end{block}" + wrapDiv attr $ ("\\begin" <> braces blockname <> braces title' <> anchor) $$ + contents $$ "\\end" <> braces blockname blockToLaTeX (Div (identifier,"slide":dclasses,dkvs) (Header _ (_,hclasses,hkvs) ils : bs)) = do -- note: [fragile] is required or verbatim breaks @@ -553,17 +366,16 @@ blockToLaTeX (Div (identifier,classes,kvs) bs) = do Just s -> braces (literal s)) $$ inner $+$ "\\end{CSLReferences}" - else if "csl-entry" `elem` classes - then vcat <$> mapM cslEntryToLaTeX bs - else blockListToLaTeX bs + else blockListToLaTeX bs modify $ \st -> st{ stIncremental = oldIncremental } linkAnchor' <- hypertarget True identifier empty - -- see #2704 for the motivation for adding \leavevmode: + -- see #2704 for the motivation for adding \leavevmode + -- and #7078 for \vadjust pre let linkAnchor = case bs of Para _ : _ | not (isEmpty linkAnchor') - -> "\\leavevmode" <> linkAnchor' <> "%" + -> "\\leavevmode\\vadjust pre{" <> linkAnchor' <> "}%" _ -> linkAnchor' wrapNotes txt = if beamer && "notes" `elem` classes then "\\note" <> braces txt -- speaker notes @@ -575,7 +387,7 @@ blockToLaTeX (Plain lst) = blockToLaTeX (Para [Image attr@(ident, _, _) txt (src,tgt)]) | Just tit <- T.stripPrefix "fig:" tgt = do - (capt, captForLof, footnotes) <- getCaption True txt + (capt, captForLof, footnotes) <- getCaption inlineListToLaTeX True txt lab <- labelFor ident let caption = "\\caption" <> captForLof <> braces capt <> lab img <- inlineToLaTeX (Image attr txt (src,tit)) @@ -776,181 +588,14 @@ blockToLaTeX (Header level (id',classes,_) lst) = do hdr <- sectionHeader classes id' level lst modify $ \s -> s{stInHeading = False} return hdr -blockToLaTeX (Table _ blkCapt specs thead tbody tfoot) = do - let (caption, aligns, widths, heads, rows) = toLegacyTable blkCapt specs thead tbody tfoot - -- simple tables have to have simple cells: - let isSimple [Plain _] = True - isSimple [Para _] = True - isSimple [] = True - isSimple _ = False - let widths' = if all (== 0) widths && not (all (all isSimple) rows) - then replicate (length aligns) - (1 / fromIntegral (length aligns)) - else widths - (captionText, captForLof, captNotes) <- getCaption False caption - let toHeaders hs = do contents <- tableRowToLaTeX True aligns hs - return ("\\toprule" $$ contents $$ "\\midrule") - let removeNote (Note _) = Span ("", [], []) [] - removeNote x = x - firsthead <- if isEmpty captionText || all null heads - then return empty - else ($$ text "\\endfirsthead") <$> toHeaders heads - head' <- if all null heads - then return "\\toprule" - -- avoid duplicate 
notes in head and firsthead: - else toHeaders (if isEmpty firsthead - then heads - else walk removeNote heads) - let capt = if isEmpty captionText - then empty - else "\\caption" <> captForLof <> braces captionText - <> "\\tabularnewline" - rows' <- mapM (tableRowToLaTeX False aligns) rows - let colDescriptors = - (if all (== 0) widths' - then hcat . map literal - else (\xs -> cr <> nest 2 (vcat $ map literal xs))) $ - zipWith (toColDescriptor (length widths')) aligns widths' - modify $ \s -> s{ stTable = True } - notes <- notesToLaTeX <$> gets stNotes - return $ "\\begin{longtable}[]" <> - braces ("@{}" <> colDescriptors <> "@{}") - -- the @{} removes extra space at beginning and end - $$ capt - $$ firsthead - $$ head' - $$ "\\endhead" - $$ vcat rows' - $$ "\\bottomrule" - $$ "\\end{longtable}" - $$ captNotes - $$ notes - -getCaption :: PandocMonad m - => Bool -> [Inline] -> LW m (Doc Text, Doc Text, Doc Text) -getCaption externalNotes txt = do - oldExternalNotes <- gets stExternalNotes - modify $ \st -> st{ stExternalNotes = externalNotes, stNotes = [] } - capt <- inlineListToLaTeX txt - footnotes <- if externalNotes - then notesToLaTeX <$> gets stNotes - else return empty - modify $ \st -> st{ stExternalNotes = oldExternalNotes, stNotes = [] } - -- We can't have footnotes in the list of figures/tables, so remove them: - let getNote (Note _) = Any True - getNote _ = Any False - let hasNotes = getAny . query getNote - captForLof <- if hasNotes txt - then brackets <$> inlineListToLaTeX (walk deNote txt) - else return empty - return (capt, captForLof, footnotes) - -toColDescriptor :: Int -> Alignment -> Double -> Text -toColDescriptor _numcols align 0 = - case align of - AlignLeft -> "l" - AlignRight -> "r" - AlignCenter -> "c" - AlignDefault -> "l" -toColDescriptor numcols align width = - T.pack $ printf - ">{%s\\arraybackslash}p{(\\columnwidth - %d\\tabcolsep) * \\real{%0.2f}}" - align' - ((numcols - 1) * 2) - width - where - align' :: String - align' = case align of - AlignLeft -> "\\raggedright" - AlignRight -> "\\raggedleft" - AlignCenter -> "\\centering" - AlignDefault -> "\\raggedright" +blockToLaTeX (Table attr blkCapt specs thead tbodies tfoot) = + tableToLaTeX inlineListToLaTeX blockListToLaTeX + (Ann.toTable attr blkCapt specs thead tbodies tfoot) blockListToLaTeX :: PandocMonad m => [Block] -> LW m (Doc Text) blockListToLaTeX lst = vsep `fmap` mapM (\b -> setEmptyLine True >> blockToLaTeX b) lst -tableRowToLaTeX :: PandocMonad m - => Bool - -> [Alignment] - -> [[Block]] - -> LW m (Doc Text) -tableRowToLaTeX header aligns cols = do - cells <- mapM (tableCellToLaTeX header) $ zip aligns cols - return $ hsep (intersperse "&" cells) <> " \\\\ \\addlinespace" - --- For simple latex tables (without minipages or parboxes), --- we need to go to some lengths to get line breaks working: --- as LineBreak bs = \vtop{\hbox{\strut as}\hbox{\strut bs}}. -fixLineBreaks :: Block -> Block -fixLineBreaks (Para ils) = Para $ fixLineBreaks' ils -fixLineBreaks (Plain ils) = Plain $ fixLineBreaks' ils -fixLineBreaks x = x - -fixLineBreaks' :: [Inline] -> [Inline] -fixLineBreaks' ils = case splitBy (== LineBreak) ils of - [] -> [] - [xs] -> xs - chunks -> RawInline "tex" "\\vtop{" : - concatMap tohbox chunks <> - [RawInline "tex" "}"] - where tohbox ys = RawInline "tex" "\\hbox{\\strut " : ys <> - [RawInline "tex" "}"] - --- We also change display math to inline math, since display --- math breaks in simple tables. 
-displayMathToInline :: Inline -> Inline -displayMathToInline (Math DisplayMath x) = Math InlineMath x -displayMathToInline x = x - -tableCellToLaTeX :: PandocMonad m - => Bool -> (Alignment, [Block]) - -> LW m (Doc Text) -tableCellToLaTeX header (align, blocks) = do - beamer <- gets stBeamer - externalNotes <- gets stExternalNotes - inMinipage <- gets stInMinipage - -- See #5367 -- footnotehyper/footnote don't work in beamer, - -- so we need to produce the notes outside the table... - modify $ \st -> st{ stExternalNotes = beamer } - let isPlainOrPara Para{} = True - isPlainOrPara Plain{} = True - isPlainOrPara _ = False - result <- - if all isPlainOrPara blocks - then - blockListToLaTeX $ walk fixLineBreaks $ walk displayMathToInline blocks - else do - modify $ \st -> st{ stInMinipage = True } - cellContents <- blockListToLaTeX blocks - modify $ \st -> st{ stInMinipage = inMinipage } - let valign = text $ if header then "[b]" else "[t]" - let halign = case align of - AlignLeft -> "\\raggedright" - AlignRight -> "\\raggedleft" - AlignCenter -> "\\centering" - AlignDefault -> "\\raggedright" - return $ "\\begin{minipage}" <> valign <> - braces "\\linewidth" <> halign <> cr <> - cellContents <> cr <> - "\\end{minipage}" - modify $ \st -> st{ stExternalNotes = externalNotes } - return result - - -notesToLaTeX :: [Doc Text] -> Doc Text -notesToLaTeX [] = empty -notesToLaTeX ns = (case length ns of - n | n > 1 -> "\\addtocounter" <> - braces "footnote" <> - braces (text $ show $ 1 - n) - | otherwise -> empty) - $$ - vcat (intersperse - ("\\addtocounter" <> braces "footnote" <> braces "1") - $ map (\x -> "\\footnotetext" <> braces x) - $ reverse ns) - listItemToLaTeX :: PandocMonad m => [Block] -> LW m (Doc Text) listItemToLaTeX lst -- we need to put some text before a header if it's the first @@ -1077,81 +722,6 @@ sectionHeader classes ident level lst = do braces txtNoNotes else empty -mapAlignment :: Text -> Text -mapAlignment a = case a of - "top" -> "T" - "top-baseline" -> "t" - "bottom" -> "b" - "center" -> "c" - _ -> a - -wrapDiv :: PandocMonad m => Attr -> Doc Text -> LW m (Doc Text) -wrapDiv (_,classes,kvs) t = do - beamer <- gets stBeamer - let align dir txt = inCmd "begin" dir $$ txt $$ inCmd "end" dir - lang <- toLang $ lookup "lang" kvs - let wrapColumns = if beamer && "columns" `elem` classes - then \contents -> - let valign = maybe "T" mapAlignment (lookup "align" kvs) - totalwidth = maybe [] (\x -> ["totalwidth=" <> x]) - (lookup "totalwidth" kvs) - onlytextwidth = filter ("onlytextwidth" ==) classes - options = text $ T.unpack $ T.intercalate "," $ - valign : totalwidth ++ onlytextwidth - in inCmd "begin" "columns" <> brackets options - $$ contents - $$ inCmd "end" "columns" - else id - wrapColumn = if beamer && "column" `elem` classes - then \contents -> - let valign = - maybe "" - (brackets . text . T.unpack . 
mapAlignment) - (lookup "align" kvs) - w = maybe "0.48" fromPct (lookup "width" kvs) - in inCmd "begin" "column" <> - valign <> - braces (literal w <> "\\textwidth") - $$ contents - $$ inCmd "end" "column" - else id - fromPct xs = - case T.unsnoc xs of - Just (ds, '%') -> case safeRead ds of - Just digits -> showFl (digits / 100 :: Double) - Nothing -> xs - _ -> xs - wrapDir = case lookup "dir" kvs of - Just "rtl" -> align "RTL" - Just "ltr" -> align "LTR" - _ -> id - wrapLang txt = case lang of - Just lng -> let (l, o) = toPolyglossiaEnv lng - ops = if T.null o - then "" - else brackets $ literal o - in inCmd "begin" (literal l) <> ops - $$ blankline <> txt <> blankline - $$ inCmd "end" (literal l) - Nothing -> txt - return $ wrapColumns . wrapColumn . wrapDir . wrapLang $ t - -hypertarget :: PandocMonad m => Bool -> Text -> Doc Text -> LW m (Doc Text) -hypertarget _ "" x = return x -hypertarget addnewline ident x = do - ref <- literal `fmap` toLabel ident - return $ text "\\hypertarget" - <> braces ref - <> braces ((if addnewline && not (isEmpty x) - then "%" <> cr - else empty) <> x) - -labelFor :: PandocMonad m => Text -> LW m (Doc Text) -labelFor "" = return empty -labelFor ident = do - ref <- literal `fmap` toLabel ident - return $ text "\\label" <> braces ref - -- | Convert list of inline elements to LaTeX. inlineListToLaTeX :: PandocMonad m => [Inline] -- ^ Inlines to convert @@ -1176,27 +746,6 @@ inlineListToLaTeX lst = hcat <$> fixInitialLineBreaks xs fixInitialLineBreaks xs = xs -isQuoted :: Inline -> Bool -isQuoted (Quoted _ _) = True -isQuoted _ = False - -cslEntryToLaTeX :: PandocMonad m - => Block - -> LW m (Doc Text) -cslEntryToLaTeX (Para xs) = - mconcat <$> mapM go xs - where - go (Span ("",["csl-block"],[]) ils) = - (cr <>) . inCmd "CSLBlock" <$> inlineListToLaTeX ils - go (Span ("",["csl-left-margin"],[]) ils) = - inCmd "CSLLeftMargin" <$> inlineListToLaTeX ils - go (Span ("",["csl-right-inline"],[]) ils) = - (cr <>) . inCmd "CSLRightInline" <$> inlineListToLaTeX ils - go (Span ("",["csl-indent"],[]) ils) = - (cr <>) . 
inCmd "CSLIndent" <$> inlineListToLaTeX ils - go il = inlineToLaTeX il -cslEntryToLaTeX x = blockToLaTeX x - -- | Convert inline element to LaTeX inlineToLaTeX :: PandocMonad m => Inline -- ^ Inline to convert @@ -1204,23 +753,38 @@ inlineToLaTeX :: PandocMonad m inlineToLaTeX (Span (id',classes,kvs) ils) = do linkAnchor <- hypertarget False id' empty lang <- toLang $ lookup "lang" kvs - let cmds = ["textup" | "csl-no-emph" `elem` classes] ++ - ["textnormal" | "csl-no-strong" `elem` classes || - "csl-no-smallcaps" `elem` classes] ++ - ["RL" | ("dir", "rtl") `elem` kvs] ++ - ["LR" | ("dir", "ltr") `elem` kvs] ++ - (case lang of - Just lng -> let (l, o) = toPolyglossia lng - ops = if T.null o then "" else "[" <> o <> "]" - in ["text" <> l <> ops] - Nothing -> []) + let classToCmd "csl-no-emph" = Just "textup" + classToCmd "csl-no-strong" = Just "textnormal" + classToCmd "csl-no-smallcaps" = Just "textnormal" + classToCmd "csl-block" = Just "CSLBlock" + classToCmd "csl-left-margin" = Just "CSLLeftMargin" + classToCmd "csl-right-inline" = Just "CSLRightInline" + classToCmd "csl-indent" = Just "CSLIndent" + classToCmd _ = Nothing + kvToCmd ("dir","rtl") = Just "RL" + kvToCmd ("dir","ltr") = Just "LR" + kvToCmd _ = Nothing + langCmds = + case lang of + Just lng -> let (l, o) = toPolyglossia lng + ops = if T.null o then "" else "[" <> o <> "]" + in ["text" <> l <> ops] + Nothing -> [] + let cmds = mapMaybe classToCmd classes ++ mapMaybe kvToCmd kvs ++ langCmds contents <- inlineListToLaTeX ils - return $ (if T.null id' - then empty - else "\\protect" <> linkAnchor) <> - (if null cmds - then braces contents - else foldr inCmd contents cmds) + return $ + (case classes of + ["csl-block"] -> (cr <>) + ["csl-left-margin"] -> (cr <>) + ["csl-right-inline"] -> (cr <>) + ["csl-indent"] -> (cr <>) + _ -> id) $ + (if T.null id' + then empty + else "\\protect" <> linkAnchor) <> + (if null cmds + then braces contents + else foldr inCmd contents cmds) inlineToLaTeX (Emph lst) = inCmd "emph" <$> inlineListToLaTeX lst inlineToLaTeX (Underline lst) = inCmd "underline" <$> inlineListToLaTeX lst inlineToLaTeX (Strong lst) = inCmd "textbf" <$> inlineListToLaTeX lst @@ -1242,8 +806,8 @@ inlineToLaTeX (Cite cits lst) = do st <- get let opts = stOptions st case writerCiteMethod opts of - Natbib -> citationsToNatbib cits - Biblatex -> citationsToBiblatex cits + Natbib -> citationsToNatbib inlineListToLaTeX cits + Biblatex -> citationsToBiblatex inlineListToLaTeX cits _ -> inlineListToLaTeX lst inlineToLaTeX (Code (_,classes,kvs) str) = do @@ -1267,7 +831,19 @@ inlineToLaTeX (Code (_,classes,kvs) str) = do let chr = case "!\"'()*,-./:;?@" \\ T.unpack str of (c:_) -> c [] -> '!' 
- let str' = escapeStringUsing (backslashEscapes "\\{}%~_&#^") str + let isEscapable '\\' = True + isEscapable '{' = True + isEscapable '}' = True + isEscapable '%' = True + isEscapable '~' = True + isEscapable '_' = True + isEscapable '&' = True + isEscapable '#' = True + isEscapable '^' = True + isEscapable _ = False + let escChar c | isEscapable c = T.pack ['\\',c] + | otherwise = T.singleton c + let str' = T.concatMap escChar str -- we always put lstinline in a dummy 'passthrough' command -- (defined in the default template) so that we don't have -- to change the way we escape characters depending on whether @@ -1317,6 +893,10 @@ inlineToLaTeX (Quoted qt lst) = do if isEnabled Ext_smart opts then char '`' <> inner <> char '\'' else char '\x2018' <> inner <> char '\x2019' + where + isQuoted (Span _ (x:_)) = isQuoted x + isQuoted (Quoted _ _) = True + isQuoted _ = False inlineToLaTeX (Str str) = do setEmptyLine False liftM literal $ stringToLaTeX TextString str @@ -1339,7 +919,7 @@ inlineToLaTeX il@(RawInline f str) = do inlineToLaTeX LineBreak = do emptyLine <- gets stEmptyLine setEmptyLine True - return $ (if emptyLine then "~" else "") <> "\\\\" <> cr + return $ (if emptyLine then "\\strut " else "") <> "\\\\" <> cr inlineToLaTeX SoftBreak = do wrapText <- gets (writerWrapText . stOptions) case wrapText of @@ -1454,153 +1034,6 @@ protectCode x = [x] setEmptyLine :: PandocMonad m => Bool -> LW m () setEmptyLine b = modify $ \st -> st{ stEmptyLine = b } -citationsToNatbib :: PandocMonad m => [Citation] -> LW m (Doc Text) -citationsToNatbib - [one] - = citeCommand c p s k - where - Citation { citationId = k - , citationPrefix = p - , citationSuffix = s - , citationMode = m - } - = one - c = case m of - AuthorInText -> "citet" - SuppressAuthor -> "citeyearpar" - NormalCitation -> "citep" - -citationsToNatbib cits - | noPrefix (tail cits) && noSuffix (init cits) && ismode NormalCitation cits - = citeCommand "citep" p s ks - where - noPrefix = all (null . citationPrefix) - noSuffix = all (null . citationSuffix) - ismode m = all ((==) m . 
citationMode) - p = citationPrefix $ - head cits - s = citationSuffix $ - last cits - ks = T.intercalate ", " $ map citationId cits - -citationsToNatbib (c:cs) | citationMode c == AuthorInText = do - author <- citeCommand "citeauthor" [] [] (citationId c) - cits <- citationsToNatbib (c { citationMode = SuppressAuthor } : cs) - return $ author <+> cits - -citationsToNatbib cits = do - cits' <- mapM convertOne cits - return $ text "\\citetext{" <> foldl' combineTwo empty cits' <> text "}" - where - combineTwo a b | isEmpty a = b - | otherwise = a <> text "; " <> b - convertOne Citation { citationId = k - , citationPrefix = p - , citationSuffix = s - , citationMode = m - } - = case m of - AuthorInText -> citeCommand "citealt" p s k - SuppressAuthor -> citeCommand "citeyear" p s k - NormalCitation -> citeCommand "citealp" p s k - -citeCommand :: PandocMonad m - => Text -> [Inline] -> [Inline] -> Text -> LW m (Doc Text) -citeCommand c p s k = do - args <- citeArguments p s k - return $ literal ("\\" <> c) <> args - -type Prefix = [Inline] -type Suffix = [Inline] -type CiteId = Text -data CiteGroup = CiteGroup Prefix Suffix [CiteId] - -citeArgumentsList :: PandocMonad m - => CiteGroup -> LW m (Doc Text) -citeArgumentsList (CiteGroup _ _ []) = return empty -citeArgumentsList (CiteGroup pfxs sfxs ids) = do - pdoc <- inlineListToLaTeX pfxs - sdoc <- inlineListToLaTeX sfxs' - return $ optargs pdoc sdoc <> - braces (literal (T.intercalate "," (reverse ids))) - where sfxs' = stripLocatorBraces $ case sfxs of - (Str t : r) -> case T.uncons t of - Just (x, xs) - | T.null xs - , isPunctuation x -> dropWhile (== Space) r - | isPunctuation x -> Str xs : r - _ -> sfxs - _ -> sfxs - optargs pdoc sdoc = case (isEmpty pdoc, isEmpty sdoc) of - (True, True ) -> empty - (True, False) -> brackets sdoc - (_ , _ ) -> brackets pdoc <> brackets sdoc - -citeArguments :: PandocMonad m - => [Inline] -> [Inline] -> Text -> LW m (Doc Text) -citeArguments p s k = citeArgumentsList (CiteGroup p s [k]) - --- strip off {} used to define locator in pandoc-citeproc; see #5722 -stripLocatorBraces :: [Inline] -> [Inline] -stripLocatorBraces = walk go - where go (Str xs) = Str $ T.filter (\c -> c /= '{' && c /= '}') xs - go x = x - -citationsToBiblatex :: PandocMonad m => [Citation] -> LW m (Doc Text) -citationsToBiblatex - [one] - = citeCommand cmd p s k - where - Citation { citationId = k - , citationPrefix = p - , citationSuffix = s - , citationMode = m - } = one - cmd = case m of - SuppressAuthor -> "autocite*" - AuthorInText -> "textcite" - NormalCitation -> "autocite" - -citationsToBiblatex (c:cs) - | all (\cit -> null (citationPrefix cit) && null (citationSuffix cit)) (c:cs) - = do - let cmd = case citationMode c of - SuppressAuthor -> "\\autocite*" - AuthorInText -> "\\textcite" - NormalCitation -> "\\autocite" - return $ text cmd <> - braces (literal (T.intercalate "," (map citationId (c:cs)))) - | otherwise - = do - let cmd = case citationMode c of - SuppressAuthor -> "\\autocites*" - AuthorInText -> "\\textcites" - NormalCitation -> "\\autocites" - - groups <- mapM citeArgumentsList (reverse (foldl' grouper [] (c:cs))) - - return $ text cmd <> mconcat groups - - where grouper prev cit = case prev of - ((CiteGroup oPfx oSfx ids):rest) - | null oSfx && null pfx -> CiteGroup oPfx sfx (cid:ids) : rest - _ -> CiteGroup pfx sfx [cid] : prev - where pfx = citationPrefix cit - sfx = citationSuffix cit - cid = citationId cit - -citationsToBiblatex _ = return empty - --- Determine listings language from list of class attributes. 
-getListingsLanguage :: [Text] -> Maybe Text -getListingsLanguage xs - = foldr ((<|>) . toListingsLanguage) Nothing xs - -mbBraced :: Text -> Text -mbBraced x = if not (T.all isAlphaNum x) - then "{" <> x <> "}" - else x - -- Extract a key from divs and spans extract :: Text -> Block -> [Text] extract key (Div attr _) = lookKey key attr @@ -1617,175 +1050,3 @@ extractInline _ _ = [] -- Look up a key in an attribute and give a list of its values lookKey :: Text -> Attr -> [Text] lookKey key (_,_,kvs) = maybe [] T.words $ lookup key kvs - --- In environments \Arabic instead of \arabic is used -toPolyglossiaEnv :: Lang -> (Text, Text) -toPolyglossiaEnv l = - case toPolyglossia l of - ("arabic", o) -> ("Arabic", o) - x -> x - --- Takes a list of the constituents of a BCP 47 language code and --- converts it to a Polyglossia (language, options) tuple --- http://mirrors.ctan.org/macros/latex/contrib/polyglossia/polyglossia.pdf -toPolyglossia :: Lang -> (Text, Text) -toPolyglossia (Lang "ar" _ "DZ" _) = ("arabic", "locale=algeria") -toPolyglossia (Lang "ar" _ "IQ" _) = ("arabic", "locale=mashriq") -toPolyglossia (Lang "ar" _ "JO" _) = ("arabic", "locale=mashriq") -toPolyglossia (Lang "ar" _ "LB" _) = ("arabic", "locale=mashriq") -toPolyglossia (Lang "ar" _ "LY" _) = ("arabic", "locale=libya") -toPolyglossia (Lang "ar" _ "MA" _) = ("arabic", "locale=morocco") -toPolyglossia (Lang "ar" _ "MR" _) = ("arabic", "locale=mauritania") -toPolyglossia (Lang "ar" _ "PS" _) = ("arabic", "locale=mashriq") -toPolyglossia (Lang "ar" _ "SY" _) = ("arabic", "locale=mashriq") -toPolyglossia (Lang "ar" _ "TN" _) = ("arabic", "locale=tunisia") -toPolyglossia (Lang "de" _ _ vars) - | "1901" `elem` vars = ("german", "spelling=old") -toPolyglossia (Lang "de" _ "AT" vars) - | "1901" `elem` vars = ("german", "variant=austrian, spelling=old") -toPolyglossia (Lang "de" _ "AT" _) = ("german", "variant=austrian") -toPolyglossia (Lang "de" _ "CH" vars) - | "1901" `elem` vars = ("german", "variant=swiss, spelling=old") -toPolyglossia (Lang "de" _ "CH" _) = ("german", "variant=swiss") -toPolyglossia (Lang "de" _ _ _) = ("german", "") -toPolyglossia (Lang "dsb" _ _ _) = ("lsorbian", "") -toPolyglossia (Lang "el" _ "polyton" _) = ("greek", "variant=poly") -toPolyglossia (Lang "en" _ "AU" _) = ("english", "variant=australian") -toPolyglossia (Lang "en" _ "CA" _) = ("english", "variant=canadian") -toPolyglossia (Lang "en" _ "GB" _) = ("english", "variant=british") -toPolyglossia (Lang "en" _ "NZ" _) = ("english", "variant=newzealand") -toPolyglossia (Lang "en" _ "UK" _) = ("english", "variant=british") -toPolyglossia (Lang "en" _ "US" _) = ("english", "variant=american") -toPolyglossia (Lang "grc" _ _ _) = ("greek", "variant=ancient") -toPolyglossia (Lang "hsb" _ _ _) = ("usorbian", "") -toPolyglossia (Lang "la" _ _ vars) - | "x-classic" `elem` vars = ("latin", "variant=classic") -toPolyglossia (Lang "pt" _ "BR" _) = ("portuguese", "variant=brazilian") -toPolyglossia (Lang "sl" _ _ _) = ("slovenian", "") -toPolyglossia x = (commonFromBcp47 x, "") - --- Takes a list of the constituents of a BCP 47 language code and --- converts it to a Babel language string. 
--- http://mirrors.ctan.org/macros/latex/required/babel/base/babel.pdf --- List of supported languages (slightly outdated): --- http://tug.ctan.org/language/hyph-utf8/doc/generic/hyph-utf8/hyphenation.pdf -toBabel :: Lang -> Text -toBabel (Lang "de" _ "AT" vars) - | "1901" `elem` vars = "austrian" - | otherwise = "naustrian" -toBabel (Lang "de" _ "CH" vars) - | "1901" `elem` vars = "swissgerman" - | otherwise = "nswissgerman" -toBabel (Lang "de" _ _ vars) - | "1901" `elem` vars = "german" - | otherwise = "ngerman" -toBabel (Lang "dsb" _ _ _) = "lowersorbian" -toBabel (Lang "el" _ _ vars) - | "polyton" `elem` vars = "polutonikogreek" -toBabel (Lang "en" _ "AU" _) = "australian" -toBabel (Lang "en" _ "CA" _) = "canadian" -toBabel (Lang "en" _ "GB" _) = "british" -toBabel (Lang "en" _ "NZ" _) = "newzealand" -toBabel (Lang "en" _ "UK" _) = "british" -toBabel (Lang "en" _ "US" _) = "american" -toBabel (Lang "fr" _ "CA" _) = "canadien" -toBabel (Lang "fra" _ _ vars) - | "aca" `elem` vars = "acadian" -toBabel (Lang "grc" _ _ _) = "polutonikogreek" -toBabel (Lang "hsb" _ _ _) = "uppersorbian" -toBabel (Lang "la" _ _ vars) - | "x-classic" `elem` vars = "classiclatin" -toBabel (Lang "pt" _ "BR" _) = "brazilian" -toBabel (Lang "sl" _ _ _) = "slovene" -toBabel x = commonFromBcp47 x - --- Takes a list of the constituents of a BCP 47 language code --- and converts it to a string shared by Babel and Polyglossia. --- https://tools.ietf.org/html/bcp47#section-2.1 -commonFromBcp47 :: Lang -> Text -commonFromBcp47 (Lang "sr" "Cyrl" _ _) = "serbianc" -commonFromBcp47 (Lang "zh" "Latn" _ vars) - | "pinyin" `elem` vars = "pinyin" -commonFromBcp47 (Lang l _ _ _) = fromIso l - where - fromIso "af" = "afrikaans" - fromIso "am" = "amharic" - fromIso "ar" = "arabic" - fromIso "as" = "assamese" - fromIso "ast" = "asturian" - fromIso "bg" = "bulgarian" - fromIso "bn" = "bengali" - fromIso "bo" = "tibetan" - fromIso "br" = "breton" - fromIso "ca" = "catalan" - fromIso "cy" = "welsh" - fromIso "cs" = "czech" - fromIso "cop" = "coptic" - fromIso "da" = "danish" - fromIso "dv" = "divehi" - fromIso "el" = "greek" - fromIso "en" = "english" - fromIso "eo" = "esperanto" - fromIso "es" = "spanish" - fromIso "et" = "estonian" - fromIso "eu" = "basque" - fromIso "fa" = "farsi" - fromIso "fi" = "finnish" - fromIso "fr" = "french" - fromIso "fur" = "friulan" - fromIso "ga" = "irish" - fromIso "gd" = "scottish" - fromIso "gez" = "ethiopic" - fromIso "gl" = "galician" - fromIso "he" = "hebrew" - fromIso "hi" = "hindi" - fromIso "hr" = "croatian" - fromIso "hu" = "magyar" - fromIso "hy" = "armenian" - fromIso "ia" = "interlingua" - fromIso "id" = "indonesian" - fromIso "ie" = "interlingua" - fromIso "is" = "icelandic" - fromIso "it" = "italian" - fromIso "jp" = "japanese" - fromIso "km" = "khmer" - fromIso "kmr" = "kurmanji" - fromIso "kn" = "kannada" - fromIso "ko" = "korean" - fromIso "la" = "latin" - fromIso "lo" = "lao" - fromIso "lt" = "lithuanian" - fromIso "lv" = "latvian" - fromIso "ml" = "malayalam" - fromIso "mn" = "mongolian" - fromIso "mr" = "marathi" - fromIso "nb" = "norsk" - fromIso "nl" = "dutch" - fromIso "nn" = "nynorsk" - fromIso "no" = "norsk" - fromIso "nqo" = "nko" - fromIso "oc" = "occitan" - fromIso "pa" = "panjabi" - fromIso "pl" = "polish" - fromIso "pms" = "piedmontese" - fromIso "pt" = "portuguese" - fromIso "rm" = "romansh" - fromIso "ro" = "romanian" - fromIso "ru" = "russian" - fromIso "sa" = "sanskrit" - fromIso "se" = "samin" - fromIso "sk" = "slovak" - fromIso "sq" = "albanian" - fromIso "sr" = 
"serbian" - fromIso "sv" = "swedish" - fromIso "syr" = "syriac" - fromIso "ta" = "tamil" - fromIso "te" = "telugu" - fromIso "th" = "thai" - fromIso "ti" = "ethiopic" - fromIso "tk" = "turkmen" - fromIso "tr" = "turkish" - fromIso "uk" = "ukrainian" - fromIso "ur" = "urdu" - fromIso "vi" = "vietnamese" - fromIso _ = "" diff --git a/src/Text/Pandoc/Writers/LaTeX/Caption.hs b/src/Text/Pandoc/Writers/LaTeX/Caption.hs new file mode 100644 index 000000000..ab4d365cc --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Caption.hs @@ -0,0 +1,47 @@ +{- | + Module : Text.Pandoc.Writers.LaTeX.Caption + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Write figure or table captions as LaTeX. +-} +module Text.Pandoc.Writers.LaTeX.Caption + ( getCaption + ) where + +import Control.Monad.State.Strict +import Data.Monoid (Any(..)) +import Data.Text (Text) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Text.DocLayout (Doc, brackets, empty) +import Text.Pandoc.Shared +import Text.Pandoc.Walk +import Text.Pandoc.Writers.LaTeX.Notes (notesToLaTeX) +import Text.Pandoc.Writers.LaTeX.Types + ( LW, WriterState (stExternalNotes, stNotes) ) + +getCaption :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> Bool -> [Inline] + -> LW m (Doc Text, Doc Text, Doc Text) +getCaption inlineListToLaTeX externalNotes txt = do + oldExternalNotes <- gets stExternalNotes + modify $ \st -> st{ stExternalNotes = externalNotes, stNotes = [] } + capt <- inlineListToLaTeX txt + footnotes <- if externalNotes + then notesToLaTeX <$> gets stNotes + else return empty + modify $ \st -> st{ stExternalNotes = oldExternalNotes, stNotes = [] } + -- We can't have footnotes in the list of figures/tables, so remove them: + let getNote (Note _) = Any True + getNote _ = Any False + let hasNotes = getAny . 
query getNote + captForLof <- if hasNotes txt + then brackets <$> inlineListToLaTeX (walk deNote txt) + else return empty + return (capt, captForLof, footnotes) diff --git a/src/Text/Pandoc/Writers/LaTeX/Citation.hs b/src/Text/Pandoc/Writers/LaTeX/Citation.hs new file mode 100644 index 000000000..f48a43d7a --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Citation.hs @@ -0,0 +1,181 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.LaTeX.Citation + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Writers.LaTeX.Citation + ( citationsToNatbib, + citationsToBiblatex + ) where + +import Data.Text (Text) +import Data.Char (isPunctuation) +import qualified Data.Text as T +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Data.List (foldl') +import Text.DocLayout (Doc, brackets, empty, (<+>), text, isEmpty, literal, + braces) +import Text.Pandoc.Walk +import Text.Pandoc.Writers.LaTeX.Types ( LW ) + +citationsToNatbib :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> [Citation] + -> LW m (Doc Text) +citationsToNatbib inlineListToLaTeX [one] + = citeCommand inlineListToLaTeX c p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = one + c = case m of + AuthorInText -> "citet" + SuppressAuthor -> "citeyearpar" + NormalCitation -> "citep" + +citationsToNatbib inlineListToLaTeX cits + | noPrefix (tail cits) && noSuffix (init cits) && ismode NormalCitation cits + = citeCommand inlineListToLaTeX "citep" p s ks + where + noPrefix = all (null . citationPrefix) + noSuffix = all (null . citationSuffix) + ismode m = all ((==) m . 
citationMode) + p = citationPrefix $ + head cits + s = citationSuffix $ + last cits + ks = T.intercalate ", " $ map citationId cits + +citationsToNatbib inlineListToLaTeX (c:cs) + | citationMode c == AuthorInText = do + author <- citeCommand inlineListToLaTeX "citeauthor" [] [] (citationId c) + cits <- citationsToNatbib inlineListToLaTeX + (c { citationMode = SuppressAuthor } : cs) + return $ author <+> cits + +citationsToNatbib inlineListToLaTeX cits = do + cits' <- mapM convertOne cits + return $ text "\\citetext{" <> foldl' combineTwo empty cits' <> text "}" + where + citeCommand' = citeCommand inlineListToLaTeX + combineTwo a b | isEmpty a = b + | otherwise = a <> text "; " <> b + convertOne Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } + = case m of + AuthorInText -> citeCommand' "citealt" p s k + SuppressAuthor -> citeCommand' "citeyear" p s k + NormalCitation -> citeCommand' "citealp" p s k + +citeCommand :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> Text + -> [Inline] + -> [Inline] + -> Text + -> LW m (Doc Text) +citeCommand inlineListToLaTeX c p s k = do + args <- citeArguments inlineListToLaTeX p s k + return $ literal ("\\" <> c) <> args + +type Prefix = [Inline] +type Suffix = [Inline] +type CiteId = Text +data CiteGroup = CiteGroup Prefix Suffix [CiteId] + +citeArgumentsList :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> CiteGroup + -> LW m (Doc Text) +citeArgumentsList _inlineListToLaTeX (CiteGroup _ _ []) = return empty +citeArgumentsList inlineListToLaTeX (CiteGroup pfxs sfxs ids) = do + pdoc <- inlineListToLaTeX pfxs + sdoc <- inlineListToLaTeX sfxs' + return $ optargs pdoc sdoc <> + braces (literal (T.intercalate "," (reverse ids))) + where sfxs' = stripLocatorBraces $ case sfxs of + (Str t : r) -> case T.uncons t of + Just (x, xs) + | T.null xs + , isPunctuation x -> dropWhile (== Space) r + | isPunctuation x -> Str xs : r + _ -> sfxs + _ -> sfxs + optargs pdoc sdoc = case (isEmpty pdoc, isEmpty sdoc) of + (True, True ) -> empty + (True, False) -> brackets sdoc + (_ , _ ) -> brackets pdoc <> brackets sdoc + +citeArguments :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> [Inline] + -> [Inline] + -> Text + -> LW m (Doc Text) +citeArguments inlineListToLaTeX p s k = + citeArgumentsList inlineListToLaTeX (CiteGroup p s [k]) + +-- strip off {} used to define locator in pandoc-citeproc; see #5722 +stripLocatorBraces :: [Inline] -> [Inline] +stripLocatorBraces = walk go + where go (Str xs) = Str $ T.filter (\c -> c /= '{' && c /= '}') xs + go x = x + +citationsToBiblatex :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> [Citation] -> LW m (Doc Text) +citationsToBiblatex inlineListToLaTeX [one] + = citeCommand inlineListToLaTeX cmd p s k + where + Citation { citationId = k + , citationPrefix = p + , citationSuffix = s + , citationMode = m + } = one + cmd = case m of + SuppressAuthor -> "autocite*" + AuthorInText -> "textcite" + NormalCitation -> "autocite" + +citationsToBiblatex inlineListToLaTeX (c:cs) + | all (\cit -> null (citationPrefix cit) && null (citationSuffix cit)) (c:cs) + = do + let cmd = case citationMode c of + SuppressAuthor -> "\\autocite*" + AuthorInText -> "\\textcite" + NormalCitation -> "\\autocite" + return $ text cmd <> + braces (literal (T.intercalate "," (map citationId (c:cs)))) + | otherwise + = do + let cmd = case citationMode c of + SuppressAuthor -> "\\autocites*" + AuthorInText -> "\\textcites" + NormalCitation -> "\\autocites" + + groups <- mapM 
(citeArgumentsList inlineListToLaTeX) + (reverse (foldl' grouper [] (c:cs))) + + return $ text cmd <> mconcat groups + + where grouper prev cit = case prev of + ((CiteGroup oPfx oSfx ids):rest) + | null oSfx && null pfx -> CiteGroup oPfx sfx (cid:ids) : rest + _ -> CiteGroup pfx sfx [cid] : prev + where pfx = citationPrefix cit + sfx = citationSuffix cit + cid = citationId cit + +citationsToBiblatex _ _ = return empty diff --git a/src/Text/Pandoc/Writers/LaTeX/Lang.hs b/src/Text/Pandoc/Writers/LaTeX/Lang.hs new file mode 100644 index 000000000..0ba68b74e --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Lang.hs @@ -0,0 +1,192 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{- | + Module : Text.Pandoc.Writers.LaTeX.Lang + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Writers.LaTeX.Lang + ( toPolyglossiaEnv, + toPolyglossia, + toBabel + ) where +import Data.Text (Text) +import Text.Collate.Lang (Lang(..)) + + +-- In environments \Arabic instead of \arabic is used +toPolyglossiaEnv :: Lang -> (Text, Text) +toPolyglossiaEnv l = + case toPolyglossia l of + ("arabic", o) -> ("Arabic", o) + x -> x + +-- Takes a list of the constituents of a BCP47 language code and +-- converts it to a Polyglossia (language, options) tuple +-- http://mirrors.ctan.org/macros/latex/contrib/polyglossia/polyglossia.pdf +toPolyglossia :: Lang -> (Text, Text) +toPolyglossia (Lang "ar" _ (Just "DZ") _ _ _) = ("arabic", "locale=algeria") +toPolyglossia (Lang "ar" _ (Just "IQ") _ _ _) = ("arabic", "locale=mashriq") +toPolyglossia (Lang "ar" _ (Just "JO") _ _ _) = ("arabic", "locale=mashriq") +toPolyglossia (Lang "ar" _ (Just "LB") _ _ _) = ("arabic", "locale=mashriq") +toPolyglossia (Lang "ar" _ (Just "LY") _ _ _) = ("arabic", "locale=libya") +toPolyglossia (Lang "ar" _ (Just "MA") _ _ _) = ("arabic", "locale=morocco") +toPolyglossia (Lang "ar" _ (Just "MR") _ _ _) = ("arabic", "locale=mauritania") +toPolyglossia (Lang "ar" _ (Just "PS") _ _ _) = ("arabic", "locale=mashriq") +toPolyglossia (Lang "ar" _ (Just "SY") _ _ _) = ("arabic", "locale=mashriq") +toPolyglossia (Lang "ar" _ (Just "TN") _ _ _) = ("arabic", "locale=tunisia") +toPolyglossia (Lang "de" _ _ vars _ _) + | "1901" `elem` vars = ("german", "spelling=old") +toPolyglossia (Lang "de" _ (Just "AT") vars _ _) + | "1901" `elem` vars = ("german", "variant=austrian, spelling=old") +toPolyglossia (Lang "de" _ (Just "AT") _ _ _) = ("german", "variant=austrian") +toPolyglossia (Lang "de" _ (Just "CH") vars _ _) + | "1901" `elem` vars = ("german", "variant=swiss, spelling=old") +toPolyglossia (Lang "de" _ (Just "CH") _ _ _) = ("german", "variant=swiss") +toPolyglossia (Lang "de" _ _ _ _ _) = ("german", "") +toPolyglossia (Lang "dsb" _ _ _ _ _) = ("lsorbian", "") +toPolyglossia (Lang "el" _ _ vars _ _) + | "polyton" `elem` vars = ("greek", "variant=poly") +toPolyglossia (Lang "en" _ (Just "AU") _ _ _) = ("english", "variant=australian") +toPolyglossia (Lang "en" _ (Just "CA") _ _ _) = ("english", "variant=canadian") +toPolyglossia (Lang "en" _ (Just "GB") _ _ _) = ("english", "variant=british") +toPolyglossia (Lang "en" _ (Just "NZ") _ _ _) = ("english", "variant=newzealand") +toPolyglossia (Lang "en" _ (Just "UK") _ _ _) = ("english", "variant=british") +toPolyglossia (Lang "en" _ (Just "US") _ _ _) = ("english", "variant=american") +toPolyglossia (Lang "grc" _ _ _ _ _) = 
("greek", "variant=ancient") +toPolyglossia (Lang "hsb" _ _ _ _ _) = ("usorbian", "") +toPolyglossia (Lang "la" _ _ vars _ _) + | "x-classic" `elem` vars = ("latin", "variant=classic") +toPolyglossia (Lang "pt" _ (Just "BR") _ _ _) = ("portuguese", "variant=brazilian") +toPolyglossia (Lang "sl" _ _ _ _ _) = ("slovenian", "") +toPolyglossia x = (commonFromBcp47 x, "") + +-- Takes a list of the constituents of a BCP47 language code and +-- converts it to a Babel language string. +-- http://mirrors.ctan.org/macros/latex/required/babel/base/babel.pdf +-- List of supported languages (slightly outdated): +-- http://tug.ctan.org/language/hyph-utf8/doc/generic/hyph-utf8/hyphenation.pdf +toBabel :: Lang -> Text +toBabel (Lang "de" _ (Just "AT") vars _ _) + | "1901" `elem` vars = "austrian" + | otherwise = "naustrian" +toBabel (Lang "de" _ (Just "CH") vars _ _) + | "1901" `elem` vars = "swissgerman" + | otherwise = "nswissgerman" +toBabel (Lang "de" _ _ vars _ _) + | "1901" `elem` vars = "german" + | otherwise = "ngerman" +toBabel (Lang "dsb" _ _ _ _ _) = "lowersorbian" +toBabel (Lang "el" _ _ vars _ _) + | "polyton" `elem` vars = "polutonikogreek" +toBabel (Lang "en" _ (Just "AU") _ _ _) = "australian" +toBabel (Lang "en" _ (Just "CA") _ _ _) = "canadian" +toBabel (Lang "en" _ (Just "GB") _ _ _) = "british" +toBabel (Lang "en" _ (Just "NZ") _ _ _) = "newzealand" +toBabel (Lang "en" _ (Just "UK") _ _ _) = "british" +toBabel (Lang "en" _ (Just "US") _ _ _) = "american" +toBabel (Lang "fr" _ (Just "CA") _ _ _) = "canadien" +toBabel (Lang "fra" _ _ vars _ _) + | "aca" `elem` vars = "acadian" +toBabel (Lang "grc" _ _ _ _ _) = "polutonikogreek" +toBabel (Lang "hsb" _ _ _ _ _) = "uppersorbian" +toBabel (Lang "la" _ _ vars _ _) + | "x-classic" `elem` vars = "classiclatin" +toBabel (Lang "pt" _ (Just "BR") _ _ _) = "brazilian" +toBabel (Lang "sl" _ _ _ _ _) = "slovene" +toBabel x = commonFromBcp47 x + +-- Takes a list of the constituents of a BCP47 language code +-- and converts it to a string shared by Babel and Polyglossia. 
+-- https://tools.ietf.org/html/bcp47#section-2.1
+commonFromBcp47 :: Lang -> Text
+commonFromBcp47 (Lang "sr" (Just "Cyrl") _ _ _ _) = "serbianc"
+commonFromBcp47 (Lang "zh" (Just "Latn") _ vars _ _)
+ | "pinyin" `elem` vars = "pinyin"
+commonFromBcp47 (Lang l _ _ _ _ _) = fromIso l
+ where
+ fromIso "af" = "afrikaans"
+ fromIso "am" = "amharic"
+ fromIso "ar" = "arabic"
+ fromIso "as" = "assamese"
+ fromIso "ast" = "asturian"
+ fromIso "bg" = "bulgarian"
+ fromIso "bn" = "bengali"
+ fromIso "bo" = "tibetan"
+ fromIso "br" = "breton"
+ fromIso "ca" = "catalan"
+ fromIso "cy" = "welsh"
+ fromIso "cs" = "czech"
+ fromIso "cop" = "coptic"
+ fromIso "da" = "danish"
+ fromIso "dv" = "divehi"
+ fromIso "el" = "greek"
+ fromIso "en" = "english"
+ fromIso "eo" = "esperanto"
+ fromIso "es" = "spanish"
+ fromIso "et" = "estonian"
+ fromIso "eu" = "basque"
+ fromIso "fa" = "farsi"
+ fromIso "fi" = "finnish"
+ fromIso "fr" = "french"
+ fromIso "fur" = "friulan"
+ fromIso "ga" = "irish"
+ fromIso "gd" = "scottish"
+ fromIso "gez" = "ethiopic"
+ fromIso "gl" = "galician"
+ fromIso "he" = "hebrew"
+ fromIso "hi" = "hindi"
+ fromIso "hr" = "croatian"
+ fromIso "hu" = "magyar"
+ fromIso "hy" = "armenian"
+ fromIso "ia" = "interlingua"
+ fromIso "id" = "indonesian"
+ fromIso "ie" = "interlingua"
+ fromIso "is" = "icelandic"
+ fromIso "it" = "italian"
+ fromIso "ja" = "japanese"
+ fromIso "km" = "khmer"
+ fromIso "kmr" = "kurmanji"
+ fromIso "kn" = "kannada"
+ fromIso "ko" = "korean"
+ fromIso "la" = "latin"
+ fromIso "lo" = "lao"
+ fromIso "lt" = "lithuanian"
+ fromIso "lv" = "latvian"
+ fromIso "ml" = "malayalam"
+ fromIso "mn" = "mongolian"
+ fromIso "mr" = "marathi"
+ fromIso "nb" = "norsk"
+ fromIso "nl" = "dutch"
+ fromIso "nn" = "nynorsk"
+ fromIso "no" = "norsk"
+ fromIso "nqo" = "nko"
+ fromIso "oc" = "occitan"
+ fromIso "pa" = "panjabi"
+ fromIso "pl" = "polish"
+ fromIso "pms" = "piedmontese"
+ fromIso "pt" = "portuguese"
+ fromIso "rm" = "romansh"
+ fromIso "ro" = "romanian"
+ fromIso "ru" = "russian"
+ fromIso "sa" = "sanskrit"
+ fromIso "se" = "samin"
+ fromIso "sk" = "slovak"
+ fromIso "sq" = "albanian"
+ fromIso "sr" = "serbian"
+ fromIso "sv" = "swedish"
+ fromIso "syr" = "syriac"
+ fromIso "ta" = "tamil"
+ fromIso "te" = "telugu"
+ fromIso "th" = "thai"
+ fromIso "ti" = "ethiopic"
+ fromIso "tk" = "turkmen"
+ fromIso "tr" = "turkish"
+ fromIso "uk" = "ukrainian"
+ fromIso "ur" = "urdu"
+ fromIso "vi" = "vietnamese"
+ fromIso _ = ""
diff --git a/src/Text/Pandoc/Writers/LaTeX/Notes.hs b/src/Text/Pandoc/Writers/LaTeX/Notes.hs
new file mode 100644
index 000000000..f225ef0c5
--- /dev/null
+++ b/src/Text/Pandoc/Writers/LaTeX/Notes.hs
@@ -0,0 +1,34 @@
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.Writers.LaTeX.Notes
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Output LaTeX formatted footnotes. 
+-} +module Text.Pandoc.Writers.LaTeX.Notes + ( notesToLaTeX + ) where + +import Data.List (intersperse) +import Text.DocLayout ( Doc, braces, empty, text, vcat, ($$)) +import Data.Text (Text) + +notesToLaTeX :: [Doc Text] -> Doc Text +notesToLaTeX = \case + [] -> empty + ns -> (case length ns of + n | n > 1 -> "\\addtocounter" <> + braces "footnote" <> + braces (text $ show $ 1 - n) + | otherwise -> empty) + $$ + vcat (intersperse + ("\\addtocounter" <> braces "footnote" <> braces "1") + $ map (\x -> "\\footnotetext" <> braces x) + $ reverse ns) diff --git a/src/Text/Pandoc/Writers/LaTeX/Table.hs b/src/Text/Pandoc/Writers/LaTeX/Table.hs new file mode 100644 index 000000000..27a8a0257 --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Table.hs @@ -0,0 +1,307 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.LaTeX.Table + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Output LaTeX formatted tables. +-} +module Text.Pandoc.Writers.LaTeX.Table + ( tableToLaTeX + ) where +import Control.Monad.State.Strict +import Data.List (intersperse) +import qualified Data.List.NonEmpty as NonEmpty +import Data.List.NonEmpty (NonEmpty ((:|))) +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition +import Text.DocLayout + ( Doc, braces, cr, empty, hcat, hsep, isEmpty, literal, nest + , text, vcat, ($$) ) +import Text.Pandoc.Shared (blocksToInlines, splitBy, tshow) +import Text.Pandoc.Walk (walk, query) +import Data.Monoid (Any(..)) +import Text.Pandoc.Writers.LaTeX.Caption (getCaption) +import Text.Pandoc.Writers.LaTeX.Notes (notesToLaTeX) +import Text.Pandoc.Writers.LaTeX.Types + ( LW, WriterState (stBeamer, stExternalNotes, stInMinipage, stMultiRow + , stNotes, stTable) ) +import Text.Printf (printf) +import qualified Text.Pandoc.Builder as B +import qualified Text.Pandoc.Writers.AnnotatedTable as Ann + +tableToLaTeX :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> ([Block] -> LW m (Doc Text)) + -> Ann.Table + -> LW m (Doc Text) +tableToLaTeX inlnsToLaTeX blksToLaTeX tbl = do + let (Ann.Table _attr caption _specs thead tbodies tfoot) = tbl + CaptionDocs capt captNotes <- captionToLaTeX inlnsToLaTeX caption + let removeNote (Note _) = Span ("", [], []) [] + removeNote x = x + firsthead <- if isEmpty capt || isEmptyHead thead + then return empty + else ($$ text "\\endfirsthead") <$> + headToLaTeX blksToLaTeX thead + head' <- if isEmptyHead thead + then return "\\toprule" + -- avoid duplicate notes in head and firsthead: + else headToLaTeX blksToLaTeX + (if isEmpty firsthead + then thead + else walk removeNote thead) + rows' <- mapM (rowToLaTeX blksToLaTeX BodyCell) $ + mconcat (map bodyRows tbodies) <> footRows tfoot + modify $ \s -> s{ stTable = True } + notes <- notesToLaTeX <$> gets stNotes + return + $ "\\begin{longtable}[]" <> + braces ("@{}" <> colDescriptors tbl <> "@{}") + -- the @{} removes extra space at beginning and end + $$ capt + $$ firsthead + $$ head' + $$ "\\endhead" + $$ vcat rows' + $$ "\\bottomrule" + $$ "\\end{longtable}" + $$ captNotes + $$ notes + +-- | Creates column descriptors for the table. 
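-- Illustrative sketch (editor's note, not a patch line) of the
-- colDescriptors function defined just below: a simple table with default
-- widths collapses to plain column letters such as "llr", while a
-- hypothetical two-column table with relative widths 0.3 and 0.7 and default
-- alignment yields one p-column descriptor per column, roughly:
--
--   >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.30}}
--   >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.70}}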
+colDescriptors :: Ann.Table -> Doc Text +colDescriptors (Ann.Table _attr _caption specs thead tbodies tfoot) = + let (aligns, widths) = unzip specs + + defaultWidthsOnly = all (== ColWidthDefault) widths + isSimpleTable = all (all isSimpleCell) $ mconcat + [ headRows thead + , concatMap bodyRows tbodies + , footRows tfoot + ] + + relativeWidths = if defaultWidthsOnly + then replicate (length specs) + (1 / fromIntegral (length specs)) + else map toRelWidth widths + in if defaultWidthsOnly && isSimpleTable + then hcat $ map (literal . colAlign) aligns + else (cr <>) . nest 2 . vcat . map literal $ + zipWith (toColDescriptor (length specs)) + aligns + relativeWidths + where + toColDescriptor :: Int -> Alignment -> Double -> Text + toColDescriptor numcols align width = + T.pack $ printf + ">{%s\\arraybackslash}p{(\\columnwidth - %d\\tabcolsep) * \\real{%0.2f}}" + (T.unpack (alignCommand align)) + ((numcols - 1) * 2) + width + + isSimpleCell (Ann.Cell _ _ (Cell _attr _align _rowspan _colspan blocks)) = + case blocks of + [Para _] -> True + [Plain _] -> True + [] -> True + _ -> False + + toRelWidth ColWidthDefault = 0 + toRelWidth (ColWidth w) = w + +alignCommand :: Alignment -> Text +alignCommand = \case + AlignLeft -> "\\raggedright" + AlignRight -> "\\raggedleft" + AlignCenter -> "\\centering" + AlignDefault -> "\\raggedright" + +colAlign :: Alignment -> Text +colAlign = \case + AlignLeft -> "l" + AlignRight -> "r" + AlignCenter -> "c" + AlignDefault -> "l" + +data CaptionDocs = + CaptionDocs + { captionCommand :: Doc Text + , captionNotes :: Doc Text + } + +captionToLaTeX :: PandocMonad m + => ([Inline] -> LW m (Doc Text)) + -> Caption + -> LW m CaptionDocs +captionToLaTeX inlnsToLaTeX (Caption _maybeShort longCaption) = do + let caption = blocksToInlines longCaption + (captionText, captForLof, captNotes) <- getCaption inlnsToLaTeX False caption + return $ CaptionDocs + { captionNotes = captNotes + , captionCommand = if isEmpty captionText + then empty + else "\\caption" <> captForLof <> + braces captionText <> "\\tabularnewline" + } + +type BlocksWriter m = [Block] -> LW m (Doc Text) + +headToLaTeX :: PandocMonad m + => BlocksWriter m + -> Ann.TableHead + -> LW m (Doc Text) +headToLaTeX blocksWriter (Ann.TableHead _attr headerRows) = do + rowsContents <- mapM (rowToLaTeX blocksWriter HeaderCell . headerRowCells) + headerRows + return ("\\toprule" $$ vcat rowsContents $$ "\\midrule") + +-- | Converts a row of table cells into a LaTeX row. +rowToLaTeX :: PandocMonad m + => BlocksWriter m + -> CellType + -> [Ann.Cell] + -> LW m (Doc Text) +rowToLaTeX blocksWriter celltype row = do + cellsDocs <- mapM (cellToLaTeX blocksWriter celltype) (fillRow row) + return $ hsep (intersperse "&" cellsDocs) <> " \\\\" + +-- | Pads row with empty cells to adjust for rowspans above this row. +fillRow :: [Ann.Cell] -> [Ann.Cell] +fillRow = go 0 + where + go _ [] = [] + go n (acell@(Ann.Cell _spec (Ann.ColNumber colnum) cell):cells) = + let (Cell _ _ _ (ColSpan colspan) _) = cell + in map mkEmptyCell [n .. colnum - 1] ++ + acell : go (colnum + colspan) cells + + mkEmptyCell :: Int -> Ann.Cell + mkEmptyCell colnum = + Ann.Cell ((AlignDefault, ColWidthDefault):|[]) + (Ann.ColNumber colnum) + B.emptyCell + +isEmptyHead :: Ann.TableHead -> Bool +isEmptyHead (Ann.TableHead _attr []) = True +isEmptyHead (Ann.TableHead _attr rows) = all (null . headerRowCells) rows + +-- | Gets all cells in a header row. 
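-- Illustrative sketch (editor's note, not a patch line): rowToLaTeX above
-- joins the rendered cells with "&" and appends the row terminator, so a
-- body row whose cells render as "Apples" and "12" (hypothetical contents)
-- becomes
--
--   Apples & 12 \\
--
-- and if a rowspan from the previous row already occupies column 0, fillRow
-- first inserts an empty cell so the number of "&"-separated fields still
-- matches the number of columns.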
+headerRowCells :: Ann.HeaderRow -> [Ann.Cell] +headerRowCells (Ann.HeaderRow _attr _rownum cells) = cells + +-- | Gets all cells in a body row. +bodyRowCells :: Ann.BodyRow -> [Ann.Cell] +bodyRowCells (Ann.BodyRow _attr _rownum rowhead cells) = rowhead <> cells + +-- | Gets a list of rows of the table body, where a row is a simple +-- list of cells. +bodyRows :: Ann.TableBody -> [[Ann.Cell]] +bodyRows (Ann.TableBody _attr _rowheads headerRows rows) = + map headerRowCells headerRows <> map bodyRowCells rows + +-- | Gets a list of rows of the table head, where a row is a simple +-- list of cells. +headRows :: Ann.TableHead -> [[Ann.Cell]] +headRows (Ann.TableHead _attr rows) = map headerRowCells rows + +-- | Gets a list of rows from the foot, where a row is a simple list +-- of cells. +footRows :: Ann.TableFoot -> [[Ann.Cell]] +footRows (Ann.TableFoot _attr rows) = map headerRowCells rows + +-- For simple latex tables (without minipages or parboxes), +-- we need to go to some lengths to get line breaks working: +-- as LineBreak bs = \vtop{\hbox{\strut as}\hbox{\strut bs}}. +fixLineBreaks :: Block -> Block +fixLineBreaks = walk fixLineBreaks' + +fixLineBreaks' :: [Inline] -> [Inline] +fixLineBreaks' ils = case splitBy (== LineBreak) ils of + [] -> [] + [xs] -> xs + chunks -> RawInline "tex" "\\vtop{" : + concatMap tohbox chunks <> + [RawInline "tex" "}"] + where tohbox ys = RawInline "tex" "\\hbox{\\strut " : ys <> + [RawInline "tex" "}"] + +-- We also change display math to inline math, since display +-- math breaks in simple tables. +displayMathToInline :: Inline -> Inline +displayMathToInline (Math DisplayMath x) = Math InlineMath x +displayMathToInline x = x + +cellToLaTeX :: PandocMonad m + => BlocksWriter m + -> CellType + -> Ann.Cell + -> LW m (Doc Text) +cellToLaTeX blockListToLaTeX celltype annotatedCell = do + let (Ann.Cell specs _colnum cell) = annotatedCell + let hasWidths = snd (NonEmpty.head specs) /= ColWidthDefault + let specAlign = fst (NonEmpty.head specs) + let (Cell _attr align' rowspan colspan blocks) = cell + let align = case align' of + AlignDefault -> specAlign + _ -> align' + beamer <- gets stBeamer + externalNotes <- gets stExternalNotes + inMinipage <- gets stInMinipage + -- See #5367 -- footnotehyper/footnote don't work in beamer, + -- so we need to produce the notes outside the table... 
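-- Illustrative sketch (editor's note, not a patch line): when notes are
-- collected here instead of being emitted in place, notesToLaTeX (from the
-- Notes module above) later renders them after the table. With two collected
-- notes it produces roughly
--
--   \addtocounter{footnote}{-1}
--   \footnotetext{first note}
--   \addtocounter{footnote}{1}
--   \footnotetext{second note}
--
-- rewinding the footnote counter so the texts line up with the marks
-- produced inside the minipages.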
+ modify $ \st -> st{ stExternalNotes = beamer } + let isPlainOrPara = \case + Para{} -> True + Plain{} -> True + _ -> False + let hasLineBreak LineBreak = Any True + hasLineBreak _ = Any False + let hasLineBreaks = getAny $ query hasLineBreak blocks + result <- + if not hasWidths || (celltype /= HeaderCell + && all isPlainOrPara blocks + && not hasLineBreaks) + then + blockListToLaTeX $ walk fixLineBreaks $ walk displayMathToInline blocks + else do + modify $ \st -> st{ stInMinipage = True } + cellContents <- blockListToLaTeX blocks + modify $ \st -> st{ stInMinipage = inMinipage } + let valign = text $ case celltype of + HeaderCell -> "[b]" + BodyCell -> "[t]" + let halign = literal $ alignCommand align + return $ "\\begin{minipage}" <> valign <> + braces "\\linewidth" <> halign <> cr <> + cellContents <> + (if hasLineBreaks then "\\strut" else mempty) + <> cr <> + "\\end{minipage}" + modify $ \st -> st{ stExternalNotes = externalNotes } + when (rowspan /= RowSpan 1) $ + modify (\st -> st{ stMultiRow = True }) + let inMultiColumn x = case colspan of + (ColSpan 1) -> x + (ColSpan n) -> "\\multicolumn" + <> braces (literal (tshow n)) + <> braces (literal $ colAlign align) + <> braces x + let inMultiRow x = case rowspan of + (RowSpan 1) -> x + (RowSpan n) -> let nrows = literal (tshow n) + in "\\multirow" <> braces nrows + <> braces "*" <> braces x + return . inMultiColumn . inMultiRow $ result + +data CellType + = HeaderCell + | BodyCell + deriving Eq diff --git a/src/Text/Pandoc/Writers/LaTeX/Types.hs b/src/Text/Pandoc/Writers/LaTeX/Types.hs new file mode 100644 index 000000000..c06b7e923 --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Types.hs @@ -0,0 +1,83 @@ +module Text.Pandoc.Writers.LaTeX.Types + ( LW + , WriterState (..) + , startingState + ) where + +import Control.Monad.State.Strict (StateT) +import Data.Text (Text) +import Text.DocLayout (Doc) +import Text.Pandoc.Options + ( WriterOptions (writerIncremental, writerTopLevelDivision) + , TopLevelDivision (..) + ) + +-- | LaTeX writer type. The type constructor @m@ will typically be an +-- instance of PandocMonad. +type LW m = StateT WriterState m + +data WriterState = + WriterState + { stInNote :: Bool -- ^ true if we're in a note + , stInQuote :: Bool -- ^ true if in a blockquote + , stExternalNotes :: Bool -- ^ true if in context where + -- we need to store footnotes + , stInMinipage :: Bool -- ^ true if in minipage + , stInHeading :: Bool -- ^ true if in a section heading + , stInItem :: Bool -- ^ true if in \item[..] 
+ , stNotes :: [Doc Text] -- ^ notes in a minipage + , stOLLevel :: Int -- ^ level of ordered list nesting + , stOptions :: WriterOptions -- ^ writer options, so they don't have to + -- be parameter + , stVerbInNote :: Bool -- ^ true if document has verbatim text in note + , stTable :: Bool -- ^ true if document has a table + , stMultiRow :: Bool -- ^ true if document has multirow cells + , stStrikeout :: Bool -- ^ true if document has strikeout + , stUrl :: Bool -- ^ true if document has visible URL link + , stGraphics :: Bool -- ^ true if document contains images + , stLHS :: Bool -- ^ true if document has literate haskell code + , stHasChapters :: Bool -- ^ true if document has chapters + , stCsquotes :: Bool -- ^ true if document uses csquotes + , stHighlighting :: Bool -- ^ true if document has highlighted code + , stIncremental :: Bool -- ^ true if beamer lists should be + , stZwnj :: Bool -- ^ true if document has a ZWNJ character + , stInternalLinks :: [Text] -- ^ list of internal link targets + , stBeamer :: Bool -- ^ produce beamer + , stEmptyLine :: Bool -- ^ true if no content on line + , stHasCslRefs :: Bool -- ^ has a Div with class refs + , stIsFirstInDefinition :: Bool -- ^ first block in a defn list + } + +startingState :: WriterOptions -> WriterState +startingState options = + WriterState + { stInNote = False + , stInQuote = False + , stExternalNotes = False + , stInHeading = False + , stInMinipage = False + , stInItem = False + , stNotes = [] + , stOLLevel = 1 + , stOptions = options + , stVerbInNote = False + , stTable = False + , stMultiRow = False + , stStrikeout = False + , stUrl = False + , stGraphics = False + , stLHS = False + , stHasChapters = case writerTopLevelDivision options of + TopLevelPart -> True + TopLevelChapter -> True + _ -> False + , stCsquotes = False + , stHighlighting = False + , stIncremental = writerIncremental options + , stZwnj = False + , stInternalLinks = [] + , stBeamer = False + , stEmptyLine = True + , stHasCslRefs = False + , stIsFirstInDefinition = False + } diff --git a/src/Text/Pandoc/Writers/LaTeX/Util.hs b/src/Text/Pandoc/Writers/LaTeX/Util.hs new file mode 100644 index 000000000..c34338121 --- /dev/null +++ b/src/Text/Pandoc/Writers/LaTeX/Util.hs @@ -0,0 +1,275 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.LaTeX.Util + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Writers.LaTeX.Util ( + stringToLaTeX + , StringContext(..) 
+ , toLabel + , inCmd + , wrapDiv + , hypertarget + , labelFor + , getListingsLanguage + , mbBraced + ) +where + +import Control.Applicative ((<|>)) +import Control.Monad (when) +import Text.Pandoc.Class (PandocMonad, toLang) +import Text.Pandoc.Options (WriterOptions(..), isEnabled) +import Text.Pandoc.Writers.LaTeX.Types (LW, WriterState(..)) +import Text.Pandoc.Writers.LaTeX.Lang (toPolyglossiaEnv) +import Text.Pandoc.Highlighting (toListingsLanguage) +import Text.DocLayout +import Text.Pandoc.Definition +import Text.Pandoc.ImageSize (showFl) +import Control.Monad.State.Strict (gets, modify) +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Extensions (Extension(Ext_smart)) +import Data.Char (isLetter, isSpace, isDigit, isAscii, ord, isAlphaNum) +import Text.Printf (printf) +import Text.Pandoc.Shared (safeRead, elemText) +import qualified Data.Text.Normalize as Normalize +import Data.List (uncons) + +data StringContext = TextString + | URLString + | CodeString + deriving (Eq) + +-- escape things as needed for LaTeX +stringToLaTeX :: PandocMonad m => StringContext -> Text -> LW m Text +stringToLaTeX context zs = do + opts <- gets stOptions + when ('\x200c' `elemText` zs) $ + modify (\s -> s { stZwnj = True }) + return $ T.pack $ + foldr (go opts context) mempty $ T.unpack $ + if writerPreferAscii opts + then Normalize.normalize Normalize.NFD zs + else zs + where + go :: WriterOptions -> StringContext -> Char -> String -> String + go opts ctx x xs = + let ligatures = isEnabled Ext_smart opts && ctx == TextString + isUrl = ctx == URLString + mbAccentCmd = + if writerPreferAscii opts && ctx == TextString + then uncons xs >>= \(c,_) -> lookupAccent c + else Nothing + emits s = + case mbAccentCmd of + Just cmd -> + cmd <> "{" <> s <> "}" <> drop 1 xs -- drop combining accent + Nothing -> s <> xs + emitc c = + case mbAccentCmd of + Just cmd -> + cmd <> "{" <> [c] <> "}" <> drop 1 xs -- drop combining accent + Nothing -> c : xs + emitcseq cs = + case xs of + c:_ | isLetter c + , ctx == TextString + -> cs <> " " <> xs + | isSpace c -> cs <> "{}" <> xs + | ctx == TextString + -> cs <> xs + _ -> cs <> "{}" <> xs + emitquote cs = + case xs of + '`':_ -> cs <> "\\," <> xs -- add thin space + '\'':_ -> cs <> "\\," <> xs -- add thin space + _ -> cs <> xs + in case x of + '?' | ligatures -> -- avoid ?` ligature + case xs of + '`':_ -> emits "?{}" + _ -> emitc x + '!' | ligatures -> -- avoid !` ligature + case xs of + '`':_ -> emits "!{}" + _ -> emitc x + '{' -> emits "\\{" + '}' -> emits "\\}" + '`' | ctx == CodeString -> emitcseq "\\textasciigrave" + '$' | not isUrl -> emits "\\$" + '%' -> emits "\\%" + '&' -> emits "\\&" + '_' | not isUrl -> emits "\\_" + '#' -> emits "\\#" + '-' | not isUrl -> case xs of + -- prevent adjacent hyphens from forming ligatures + ('-':_) -> emits "-\\/" + _ -> emitc '-' + '~' | not isUrl -> emitcseq "\\textasciitilde" + '^' -> emits "\\^{}" + '\\'| isUrl -> emitc '/' -- NB. 
/ works as path sep even on Windows + | otherwise -> emitcseq "\\textbackslash" + '|' | not isUrl -> emitcseq "\\textbar" + '<' -> emitcseq "\\textless" + '>' -> emitcseq "\\textgreater" + '[' -> emits "{[}" -- to avoid interpretation as + ']' -> emits "{]}" -- optional arguments + '\'' | ctx == CodeString -> emitcseq "\\textquotesingle" + '\160' -> emits "~" + '\x200B' -> emits "\\hspace{0pt}" -- zero-width space + '\x202F' -> emits "\\," + '\x2026' -> emitcseq "\\ldots" + '\x2018' | ligatures -> emitquote "`" + '\x2019' | ligatures -> emitquote "'" + '\x201C' | ligatures -> emitquote "``" + '\x201D' | ligatures -> emitquote "''" + '\x2014' | ligatures -> emits "---" + '\x2013' | ligatures -> emits "--" + _ | writerPreferAscii opts + -> case x of + 'ı' -> emitcseq "\\i" + 'ȷ' -> emitcseq "\\j" + 'å' -> emitcseq "\\aa" + 'Å' -> emitcseq "\\AA" + 'ß' -> emitcseq "\\ss" + 'ø' -> emitcseq "\\o" + 'Ø' -> emitcseq "\\O" + 'Ł' -> emitcseq "\\L" + 'ł' -> emitcseq "\\l" + 'æ' -> emitcseq "\\ae" + 'Æ' -> emitcseq "\\AE" + 'œ' -> emitcseq "\\oe" + 'Œ' -> emitcseq "\\OE" + '£' -> emitcseq "\\pounds" + '€' -> emitcseq "\\euro" + '©' -> emitcseq "\\copyright" + _ -> emitc x + | otherwise -> emitc x + +lookupAccent :: Char -> Maybe String +lookupAccent '\779' = Just "\\H" +lookupAccent '\768' = Just "\\`" +lookupAccent '\769' = Just "\\'" +lookupAccent '\770' = Just "\\^" +lookupAccent '\771' = Just "\\~" +lookupAccent '\776' = Just "\\\"" +lookupAccent '\775' = Just "\\." +lookupAccent '\772' = Just "\\=" +lookupAccent '\781' = Just "\\|" +lookupAccent '\817' = Just "\\b" +lookupAccent '\807' = Just "\\c" +lookupAccent '\783' = Just "\\G" +lookupAccent '\777' = Just "\\h" +lookupAccent '\803' = Just "\\d" +lookupAccent '\785' = Just "\\f" +lookupAccent '\778' = Just "\\r" +lookupAccent '\865' = Just "\\t" +lookupAccent '\782' = Just "\\U" +lookupAccent '\780' = Just "\\v" +lookupAccent '\774' = Just "\\u" +lookupAccent '\808' = Just "\\k" +lookupAccent '\8413' = Just "\\textcircled" +lookupAccent _ = Nothing + +toLabel :: PandocMonad m => Text -> LW m Text +toLabel z = go `fmap` stringToLaTeX URLString z + where + go = T.concatMap $ \x -> case x of + _ | (isLetter x || isDigit x) && isAscii x -> T.singleton x + | x `elemText` "_-+=:;." -> T.singleton x + | otherwise -> T.pack $ "ux" <> printf "%x" (ord x) + +-- | Puts contents into LaTeX command. +inCmd :: Text -> Doc Text -> Doc Text +inCmd cmd contents = char '\\' <> literal cmd <> braces contents + +mapAlignment :: Text -> Text +mapAlignment a = case a of + "top" -> "T" + "top-baseline" -> "t" + "bottom" -> "b" + "center" -> "c" + _ -> a + +wrapDiv :: PandocMonad m => Attr -> Doc Text -> LW m (Doc Text) +wrapDiv (_,classes,kvs) t = do + beamer <- gets stBeamer + let align dir txt = inCmd "begin" dir $$ txt $$ inCmd "end" dir + lang <- toLang $ lookup "lang" kvs + let wrapColumns = if beamer && "columns" `elem` classes + then \contents -> + let valign = maybe "T" mapAlignment (lookup "align" kvs) + totalwidth = maybe [] (\x -> ["totalwidth=" <> x]) + (lookup "totalwidth" kvs) + onlytextwidth = filter ("onlytextwidth" ==) classes + options = text $ T.unpack $ T.intercalate "," $ + valign : totalwidth ++ onlytextwidth + in inCmd "begin" "columns" <> brackets options + $$ contents + $$ inCmd "end" "columns" + else id + wrapColumn = if beamer && "column" `elem` classes + then \contents -> + let valign = + maybe "" + (brackets . text . T.unpack . 
mapAlignment) + (lookup "align" kvs) + w = maybe "0.48" fromPct (lookup "width" kvs) + in inCmd "begin" "column" <> + valign <> + braces (literal w <> "\\textwidth") + $$ contents + $$ inCmd "end" "column" + else id + fromPct xs = + case T.unsnoc xs of + Just (ds, '%') -> case safeRead ds of + Just digits -> showFl (digits / 100 :: Double) + Nothing -> xs + _ -> xs + wrapDir = case lookup "dir" kvs of + Just "rtl" -> align "RTL" + Just "ltr" -> align "LTR" + _ -> id + wrapLang txt = case lang of + Just lng -> let (l, o) = toPolyglossiaEnv lng + ops = if T.null o + then "" + else brackets $ literal o + in inCmd "begin" (literal l) <> ops + $$ blankline <> txt <> blankline + $$ inCmd "end" (literal l) + Nothing -> txt + return $ wrapColumns . wrapColumn . wrapDir . wrapLang $ t + +hypertarget :: PandocMonad m => Bool -> Text -> Doc Text -> LW m (Doc Text) +hypertarget _ "" x = return x +hypertarget addnewline ident x = do + ref <- literal `fmap` toLabel ident + return $ text "\\hypertarget" + <> braces ref + <> braces ((if addnewline && not (isEmpty x) + then "%" <> cr + else empty) <> x) + +labelFor :: PandocMonad m => Text -> LW m (Doc Text) +labelFor "" = return empty +labelFor ident = do + ref <- literal `fmap` toLabel ident + return $ text "\\label" <> braces ref + +-- Determine listings language from list of class attributes. +getListingsLanguage :: [Text] -> Maybe Text +getListingsLanguage xs + = foldr ((<|>) . toListingsLanguage) Nothing xs + +mbBraced :: Text -> Text +mbBraced x = if not (T.all isAlphaNum x) + then "{" <> x <> "}" + else x diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index 4eb0db042..87b2d8d21 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -3,7 +3,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.Man - Copyright : Copyright (C) 2007-2020 John MacFarlane + Copyright : Copyright (C) 2007-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -16,6 +16,7 @@ Conversion of 'Pandoc' documents to roff man page format. module Text.Pandoc.Writers.Man ( writeMan ) where import Control.Monad.State.Strict import Data.List (intersperse) +import Data.List.NonEmpty (nonEmpty) import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T @@ -175,8 +176,7 @@ blockToMan opts (BulletList items) = do return (vcat contents) blockToMan opts (OrderedList attribs items) = do let markers = take (length items) $ orderedListMarkers attribs - let indent = 1 + - maximum (map T.length markers) + let indent = 1 + maybe 0 maximum (nonEmpty (map T.length markers)) contents <- mapM (\(num, item) -> orderedListItemToMan opts num indent item) $ zip markers items return (vcat contents) diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 6aec6b244..fda2bbcef 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -5,7 +5,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.Markdown - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -14,7 +14,7 @@ Conversion of 'Pandoc' documents to markdown-formatted plain text. 
-Markdown: <http://daringfireball.net/projects/markdown/> +Markdown: <https://daringfireball.net/projects/markdown/> -} module Text.Pandoc.Writers.Markdown ( writeMarkdown, @@ -22,15 +22,14 @@ module Text.Pandoc.Writers.Markdown ( writePlain) where import Control.Monad.Reader import Control.Monad.State.Strict -import Data.Char (isAlphaNum) import Data.Default -import Data.List (find, intersperse, sortOn, transpose) +import Data.List (intersperse, sortOn, transpose) +import Data.List.NonEmpty (nonEmpty, NonEmpty(..)) import qualified Data.Map as M -import Data.Maybe (fromMaybe, mapMaybe) +import Data.Maybe (fromMaybe, mapMaybe, isNothing) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import Network.HTTP (urlEncode) import Text.HTML.TagSoup (Tag (..), isTagText, parseTags) import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import Text.Pandoc.Definition @@ -44,59 +43,11 @@ import Text.Pandoc.Templates (renderTemplate) import Text.DocTemplates (Val(..), Context(..), FromContext(..)) import Text.Pandoc.Walk import Text.Pandoc.Writers.HTML (writeHtml5String) -import Text.Pandoc.Writers.Math (texMathToInlines) -import Text.Pandoc.XML (toHtml5Entities) -import Data.Coerce (coerce) - -type Notes = [[Block]] -type Ref = (Text, Target, Attr) -type Refs = [Ref] - -type MD m = ReaderT WriterEnv (StateT WriterState m) - -evalMD :: PandocMonad m => MD m a -> WriterEnv -> WriterState -> m a -evalMD md env st = evalStateT (runReaderT md env) st - -data WriterEnv = WriterEnv { envInList :: Bool - , envVariant :: MarkdownVariant - , envRefShortcutable :: Bool - , envBlockLevel :: Int - , envEscapeSpaces :: Bool - } - -data MarkdownVariant = - PlainText - | Commonmark - | Markdown - deriving (Show, Eq) - -instance Default WriterEnv - where def = WriterEnv { envInList = False - , envVariant = Markdown - , envRefShortcutable = True - , envBlockLevel = 0 - , envEscapeSpaces = False - } - -data WriterState = WriterState { stNotes :: Notes - , stPrevRefs :: Refs - , stRefs :: Refs - , stKeys :: M.Map Key - (M.Map (Target, Attr) Int) - , stLastIdx :: Int - , stIds :: Set.Set Text - , stNoteNum :: Int - } - -instance Default WriterState - where def = WriterState{ stNotes = [] - , stPrevRefs = [] - , stRefs = [] - , stKeys = M.empty - , stLastIdx = 0 - , stIds = Set.empty - , stNoteNum = 1 - } +import Text.Pandoc.Writers.Markdown.Inline (inlineListToMarkdown, linkAttributes, attrsToMarkdown) +import Text.Pandoc.Writers.Markdown.Types (MarkdownVariant(..), + WriterState(..), + WriterEnv(..), + Ref, Refs, MD, evalMD) -- | Convert Pandoc to Markdown. writeMarkdown :: PandocMonad m => WriterOptions -> Pandoc -> m Text @@ -116,7 +67,16 @@ writePlain opts document = -- | Convert Pandoc to Commonmark. writeCommonMark :: PandocMonad m => WriterOptions -> Pandoc -> m Text writeCommonMark opts document = - evalMD (pandocToMarkdown opts document) def{ envVariant = Commonmark } def + evalMD (pandocToMarkdown opts' document) def{ envVariant = Commonmark } def + where + opts' = opts{ writerExtensions = + -- These extensions can't be enabled or disabled + -- for commonmark because they're part of the core; + -- we set them here so that escapeText will behave + -- properly. 
+ enableExtension Ext_all_symbols_escapable $ + enableExtension Ext_intraword_underscores $ + writerExtensions opts } pandocTitleBlock :: Doc Text -> [Doc Text] -> Doc Text -> Doc Text pandocTitleBlock tit auths dat = @@ -174,23 +134,24 @@ valToYaml :: Val Text -> Doc Text valToYaml (ListVal xs) = vcat $ map (\v -> hang 2 "- " (valToYaml v)) xs valToYaml (MapVal c) = contextToYaml c +valToYaml (BoolVal True) = "true" +valToYaml (BoolVal False) = "false" valToYaml (SimpleVal x) | isEmpty x = empty | otherwise = if hasNewlines x then hang 0 ("|" <> cr) x - else if fst $ foldr needsDoubleQuotes (False, True) x + else if isNothing $ foldM needsDoubleQuotes True x then "\"" <> fmap escapeInDoubleQuotes x <> "\"" else x where - needsDoubleQuotes t (positive, isFirst) + needsDoubleQuotes isFirst t = if T.any isBadAnywhere t || (isFirst && T.any isYamlPunct (T.take 1 t)) - then (True, False) - else (positive, False) + then Nothing + else Just False isBadAnywhere '#' = True isBadAnywhere ':' = True - isBadAnywhere '`' = False isBadAnywhere _ = False hasNewlines NewLine = True hasNewlines BlankLines{} = True @@ -295,75 +256,11 @@ noteToMarkdown opts num blocks = do then hang (writerTabStop opts) (marker <> spacer) contents else marker <> spacer <> contents --- | Escape special characters for Markdown. -escapeText :: WriterOptions -> Text -> Text -escapeText opts = T.pack . go . T.unpack - where - go [] = [] - go (c:cs) = - case c of - '<' | isEnabled Ext_all_symbols_escapable opts -> - '\\' : '<' : go cs - | otherwise -> "<" ++ go cs - '>' | isEnabled Ext_all_symbols_escapable opts -> - '\\' : '>' : go cs - | otherwise -> ">" ++ go cs - '@' | isEnabled Ext_citations opts -> - case cs of - (d:_) - | isAlphaNum d || d == '_' - -> '\\':'@':go cs - _ -> '@':go cs - _ | c `elem` ['\\','`','*','_','[',']','#'] -> - '\\':c:go cs - '|' | isEnabled Ext_pipe_tables opts -> '\\':'|':go cs - '^' | isEnabled Ext_superscript opts -> '\\':'^':go cs - '~' | isEnabled Ext_subscript opts || - isEnabled Ext_strikeout opts -> '\\':'~':go cs - '$' | isEnabled Ext_tex_math_dollars opts -> '\\':'$':go cs - '\'' | isEnabled Ext_smart opts -> '\\':'\'':go cs - '"' | isEnabled Ext_smart opts -> '\\':'"':go cs - '-' | isEnabled Ext_smart opts -> - case cs of - '-':_ -> '\\':'-':go cs - _ -> '-':go cs - '.' | isEnabled Ext_smart opts -> - case cs of - '.':'.':rest -> '\\':'.':'.':'.':go rest - _ -> '.':go cs - _ -> case cs of - '_':x:xs - | isEnabled Ext_intraword_underscores opts - , isAlphaNum c - , isAlphaNum x -> c : '_' : x : go xs - _ -> c : go cs - -attrsToMarkdown :: Attr -> Doc Text -attrsToMarkdown attribs = braces $ hsep [attribId, attribClasses, attribKeys] - where attribId = case attribs of - ("",_,_) -> empty - (i,_,_) -> "#" <> escAttr i - attribClasses = case attribs of - (_,[],_) -> empty - (_,cs,_) -> hsep $ - map (escAttr . ("."<>)) - cs - attribKeys = case attribs of - (_,_,[]) -> empty - (_,_,ks) -> hsep $ - map (\(k,v) -> escAttr k - <> "=\"" <> - escAttr v <> "\"") ks - escAttr = mconcat . map escAttrChar . T.unpack - escAttrChar '"' = literal "\\\"" - escAttrChar '\\' = literal "\\\\" - escAttrChar c = literal $ T.singleton c - -linkAttributes :: WriterOptions -> Attr -> Doc Text -linkAttributes opts attr = - if isEnabled Ext_link_attributes opts && attr /= nullAttr - then attrsToMarkdown attr - else empty +-- | (Code) blocks with a single class and no attributes can just use it +-- standalone, no need to bother with curly braces. 
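-- Illustrative sketch (editor's note, not a patch line) for the helper
-- defined just below, shown doctest-style with rendered output and assuming
-- attrsToMarkdown (now imported from Markdown.Inline) keeps its previous
-- rendering:
--
-- >>> classOrAttrsToMarkdown ("", ["haskell"], [])
-- haskell
-- >>> classOrAttrsToMarkdown ("lst", ["haskell"], [])
-- {#lst .haskell}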
+classOrAttrsToMarkdown :: Attr -> Doc Text +classOrAttrsToMarkdown ("",[cls],[]) = literal cls +classOrAttrsToMarkdown attrs = attrsToMarkdown attrs -- | Ordered list start parser for use in Para below. olMarker :: Parser Text ParserState () @@ -424,9 +321,12 @@ blockToMarkdown' opts (Div attrs ils) = do case () of _ | isEnabled Ext_fenced_divs opts && attrs /= nullAttr -> - nowrap (literal ":::" <+> attrsToMarkdown attrs) $$ - chomp contents $$ - literal ":::" <> blankline + let attrsToMd = if variant == Commonmark + then attrsToMarkdown + else classOrAttrsToMarkdown + in nowrap (literal ":::" <+> attrsToMd attrs) $$ + chomp contents $$ + literal ":::" <> blankline | isEnabled Ext_native_divs opts || (isEnabled Ext_raw_html opts && (variant == Commonmark || @@ -468,7 +368,7 @@ blockToMarkdown' opts (Plain inlines) = do -- title beginning with fig: indicates figure blockToMarkdown' opts (Para [Image attr alt (src,tgt@(T.stripPrefix "fig:" -> Just tit))]) | isEnabled Ext_raw_html opts && - not (isEnabled Ext_link_attributes opts) && + not (isEnabled Ext_link_attributes opts || isEnabled Ext_attributes opts) && attr /= nullAttr = -- use raw HTML (<> blankline) . literal . T.strip <$> writeHtml5String opts{ writerTemplate = Nothing } @@ -492,25 +392,24 @@ blockToMarkdown' opts b@(RawBlock f str) = do let renderEmpty = mempty <$ report (BlockNotRendered b) case variant of PlainText -> renderEmpty - _ | f `elem` ["markdown", "markdown_github", "markdown_phpextra", - "markdown_mmd", "markdown_strict"] -> - return $ literal str <> literal "\n" - | isEnabled Ext_raw_attribute opts -> rawAttribBlock - | f `elem` ["html", "html5", "html4"] -> - case () of - _ | isEnabled Ext_markdown_attribute opts -> return $ - literal (addMarkdownAttribute str) <> literal "\n" - | isEnabled Ext_raw_html opts -> return $ - literal str <> literal "\n" - | isEnabled Ext_raw_attribute opts -> rawAttribBlock - | otherwise -> renderEmpty - | f `elem` ["latex", "tex"] -> - case () of - _ | isEnabled Ext_raw_tex opts -> return $ - literal str <> literal "\n" - | isEnabled Ext_raw_attribute opts -> rawAttribBlock - | otherwise -> renderEmpty - | otherwise -> renderEmpty + Commonmark + | f `elem` ["gfm", "commonmark", "commonmark_x", "markdown"] + -> return $ literal str <> literal "\n" + Markdown + | f `elem` ["markdown", "markdown_github", "markdown_phpextra", + "markdown_mmd", "markdown_strict"] + -> return $ literal str <> literal "\n" + _ | isEnabled Ext_raw_attribute opts -> rawAttribBlock + | f `elem` ["html", "html5", "html4"] + , isEnabled Ext_markdown_attribute opts + -> return $ literal (addMarkdownAttribute str) <> literal "\n" + | f `elem` ["html", "html5", "html4"] + , isEnabled Ext_raw_html opts + -> return $ literal str <> literal "\n" + | f `elem` ["latex", "tex"] + , isEnabled Ext_raw_tex opts + -> return $ literal str <> literal "\n" + _ -> renderEmpty blockToMarkdown' opts HorizontalRule = return $ blankline <> literal (T.replicate (writerColumns opts) "-") <> blankline blockToMarkdown' opts (Header level attr inlines) = do @@ -534,7 +433,8 @@ blockToMarkdown' opts (Header level attr inlines) = do && id' == autoId -> empty (id',_,_) | isEnabled Ext_mmd_header_identifiers opts -> space <> brackets (literal id') - _ | isEnabled Ext_header_attributes opts -> + _ | isEnabled Ext_header_attributes opts || + isEnabled Ext_attributes opts -> space <> attrsToMarkdown attr | otherwise -> empty contents <- inlineListToMarkdown opts $ @@ -584,19 +484,21 @@ blockToMarkdown' opts (CodeBlock attribs str) = do | 
isEnabled Ext_fenced_code_blocks opts -> tildes <> attrs <> cr <> literal str <> cr <> tildes <> blankline _ -> nest (writerTabStop opts) (literal str) <> blankline - where endline c = literal $ case [T.length ln - | ln <- map trim (T.lines str) - , T.pack [c,c,c] `T.isPrefixOf` ln - , T.all (== c) ln] of - [] -> T.replicate 3 $ T.singleton c - xs -> T.replicate (maximum xs + 1) $ T.singleton c - backticks = endline '`' - tildes = endline '~' - attrs = if isEnabled Ext_fenced_code_attributes opts - then nowrap $ " " <> attrsToMarkdown attribs - else case attribs of - (_,cls:_,_) -> " " <> literal cls - _ -> empty + where + endlineLen c = maybe 3 ((+1) . maximum) $ nonEmpty $ + [T.length ln + | ln <- map trim (T.lines str) + , T.pack [c,c,c] `T.isPrefixOf` ln + , T.all (== c) ln] + endline c = literal $ T.replicate (endlineLen c) $ T.singleton c + backticks = endline '`' + tildes = endline '~' + attrs = if isEnabled Ext_fenced_code_attributes opts || + isEnabled Ext_attributes opts + then nowrap $ " " <> classOrAttrsToMarkdown attribs + else case attribs of + (_,cls:_,_) -> " " <> literal cls + _ -> empty blockToMarkdown' opts (BlockQuote blocks) = do variant <- asks envVariant -- if we're writing literate haskell, put a space before the bird tracks @@ -609,7 +511,7 @@ blockToMarkdown' opts (BlockQuote blocks) = do return $ prefixed leader contents <> blankline blockToMarkdown' opts t@(Table _ blkCapt specs thead tbody tfoot) = do let (caption, aligns, widths, headers, rows) = toLegacyTable blkCapt specs thead tbody tfoot - let numcols = maximum (length aligns : length widths : + let numcols = maximum (length aligns :| length widths : map length (headers:rows)) caption' <- inlineListToMarkdown opts caption let caption'' @@ -672,7 +574,10 @@ blockToMarkdown' opts (BulletList items) = do contents <- inList $ mapM (bulletListItemToMarkdown opts) items return $ (if isTightList items then vcat else vsep) contents <> blankline blockToMarkdown' opts (OrderedList (start,sty,delim) items) = do - let start' = if isEnabled Ext_startnum opts then start else 1 + variant <- asks envVariant + let start' = if variant == Commonmark || isEnabled Ext_startnum opts + then start + else 1 let sty' = if isEnabled Ext_fancy_lists opts then sty else DefaultStyle let delim' = if isEnabled Ext_fancy_lists opts then delim else DefaultDelim let attribs = (start', sty', delim') @@ -708,7 +613,8 @@ pipeTable headless aligns rawHeaders rawRows = do blockFor AlignCenter x y = cblock (x + 2) (sp <> y <> sp) <> lblock 0 empty blockFor AlignRight x y = rblock (x + 2) (y <> sp) <> lblock 0 empty blockFor _ x y = lblock (x + 2) (sp <> y) <> lblock 0 empty - let widths = map (max 3 . maximum . map offset) $ transpose (rawHeaders : rawRows) + let widths = map (max 3 . maybe 3 maximum . nonEmpty . map offset) $ + transpose (rawHeaders : rawRows) let torow cs = nowrap $ literal "|" <> hcat (intersperse (literal "|") $ zipWith3 blockFor aligns widths (map chomp cs)) @@ -742,11 +648,11 @@ pandocTable opts multiline headless aligns widths rawHeaders rawRows = do -- Number of characters per column necessary to output every cell -- without requiring a line break. -- The @+2@ is needed for specifying the alignment. - let numChars = (+ 2) . maximum . map offset + let numChars = (+ 2) . maybe 0 maximum . nonEmpty . map offset -- Number of characters per column necessary to output every cell -- without requiring a line break *inside a word*. -- The @+2@ is needed for specifying the alignment. - let minNumChars = (+ 2) . maximum . 
map minOffset + let minNumChars = (+ 2) . maybe 0 maximum . nonEmpty . map minOffset let columns = transpose (rawHeaders : rawRows) -- minimal column width without wrapping a single word let relWidth w col = @@ -885,6 +791,9 @@ blockListToMarkdown opts blocks = do b1 : commentSep : fixBlocks (b2:bs) fixBlocks (Plain ils : bs@(RawBlock{}:_)) = Plain ils : fixBlocks bs + fixBlocks (Plain ils : bs@(Div{}:_)) + | isEnabled Ext_fenced_divs opts = + Para ils : fixBlocks bs fixBlocks (Plain ils : bs) | inlist = Plain ils : fixBlocks bs fixBlocks (Plain ils : bs) = @@ -908,488 +817,7 @@ blockListToMarkdown opts blocks = do | otherwise = RawBlock "markdown" " \n" mconcat <$> mapM (blockToMarkdown opts) (fixBlocks blocks) -getKey :: Doc Text -> Key -getKey = toKey . render Nothing - -findUsableIndex :: [Text] -> Int -> Int -findUsableIndex lbls i = if tshow i `elem` lbls - then findUsableIndex lbls (i + 1) - else i - -getNextIndex :: PandocMonad m => MD m Int -getNextIndex = do - prevRefs <- gets stPrevRefs - refs <- gets stRefs - i <- (+ 1) <$> gets stLastIdx - modify $ \s -> s{ stLastIdx = i } - let refLbls = map (\(r,_,_) -> r) $ prevRefs ++ refs - return $ findUsableIndex refLbls i - --- | Get reference for target; if none exists, create unique one and return. --- Prefer label if possible; otherwise, generate a unique key. -getReference :: PandocMonad m => Attr -> Doc Text -> Target -> MD m Text -getReference attr label target = do - refs <- gets stRefs - case find (\(_,t,a) -> t == target && a == attr) refs of - Just (ref, _, _) -> return ref - Nothing -> do - keys <- gets stKeys - let key = getKey label - let rawkey = coerce key - case M.lookup key keys of - Nothing -> do -- no other refs with this label - (lab', idx) <- if T.null rawkey || - T.length rawkey > 999 || - T.any (\c -> c == '[' || c == ']') rawkey - then do - i <- getNextIndex - return (tshow i, i) - else - return (render Nothing label, 0) - modify (\s -> s{ - stRefs = (lab', target, attr) : refs, - stKeys = M.insert (getKey label) - (M.insert (target, attr) idx mempty) - (stKeys s) }) - return lab' - - Just km -> -- we have refs with this label - case M.lookup (target, attr) km of - Just i -> do - let lab' = render Nothing $ - label <> if i == 0 - then mempty - else literal (tshow i) - -- make sure it's in stRefs; it may be - -- a duplicate that was printed in a previous - -- block: - when ((lab', target, attr) `notElem` refs) $ - modify (\s -> s{ - stRefs = (lab', target, attr) : refs }) - return lab' - Nothing -> do -- but this one is to a new target - i <- getNextIndex - let lab' = tshow i - modify (\s -> s{ - stRefs = (lab', target, attr) : refs, - stKeys = M.insert key - (M.insert (target, attr) i km) - (stKeys s) }) - return lab' - --- | Convert list of Pandoc inline elements to markdown. 
-inlineListToMarkdown :: PandocMonad m => WriterOptions -> [Inline] -> MD m (Doc Text) -inlineListToMarkdown opts lst = do - inlist <- asks envInList - go (if inlist then avoidBadWrapsInList lst else lst) - where go [] = return empty - go (i:is) = case i of - Link {} -> case is of - -- If a link is followed by another link, or '[', '(' or ':' - -- then we don't shortcut - Link {}:_ -> unshortcutable - Space:Link {}:_ -> unshortcutable - Space:(Str(thead -> Just '[')):_ -> unshortcutable - Space:(RawInline _ (thead -> Just '[')):_ -> unshortcutable - Space:(Cite _ _):_ -> unshortcutable - SoftBreak:Link {}:_ -> unshortcutable - SoftBreak:(Str(thead -> Just '[')):_ -> unshortcutable - SoftBreak:(RawInline _ (thead -> Just '[')):_ -> unshortcutable - SoftBreak:(Cite _ _):_ -> unshortcutable - LineBreak:Link {}:_ -> unshortcutable - LineBreak:(Str(thead -> Just '[')):_ -> unshortcutable - LineBreak:(RawInline _ (thead -> Just '[')):_ -> unshortcutable - LineBreak:(Cite _ _):_ -> unshortcutable - (Cite _ _):_ -> unshortcutable - Str (thead -> Just '['):_ -> unshortcutable - Str (thead -> Just '('):_ -> unshortcutable - Str (thead -> Just ':'):_ -> unshortcutable - (RawInline _ (thead -> Just '[')):_ -> unshortcutable - (RawInline _ (thead -> Just '(')):_ -> unshortcutable - (RawInline _ (thead -> Just ':')):_ -> unshortcutable - (RawInline _ (T.stripPrefix " [" -> Just _ )):_ -> unshortcutable - _ -> shortcutable - _ -> shortcutable - where shortcutable = liftM2 (<>) (inlineToMarkdown opts i) (go is) - unshortcutable = do - iMark <- local - (\env -> env { envRefShortcutable = False }) - (inlineToMarkdown opts i) - fmap (iMark <>) (go is) - thead = fmap fst . T.uncons - -isSp :: Inline -> Bool -isSp Space = True -isSp SoftBreak = True -isSp _ = False - -avoidBadWrapsInList :: [Inline] -> [Inline] -avoidBadWrapsInList [] = [] -avoidBadWrapsInList (s:Str (T.uncons -> Just ('>',cs)):xs) | isSp s = - Str (" >" <> cs) : avoidBadWrapsInList xs -avoidBadWrapsInList [s, Str (T.uncons -> Just (c, cs))] - | T.null cs && isSp s && c `elem` ['-','*','+'] = [Str $ T.pack [' ', c]] -avoidBadWrapsInList (s:Str (T.uncons -> Just (c, cs)):Space:xs) - | T.null cs && isSp s && c `elem` ['-','*','+'] = - Str (T.pack [' ', c]) : Space : avoidBadWrapsInList xs -avoidBadWrapsInList (s:Str cs:Space:xs) - | isSp s && isOrderedListMarker cs = - Str (" " <> cs) : Space : avoidBadWrapsInList xs -avoidBadWrapsInList [s, Str cs] - | isSp s && isOrderedListMarker cs = [Str $ " " <> cs] -avoidBadWrapsInList (x:xs) = x : avoidBadWrapsInList xs - -isOrderedListMarker :: Text -> Bool -isOrderedListMarker xs = not (T.null xs) && (T.last xs `elem` ['.',')']) && - isRight (runParser (anyOrderedListMarker >> eof) - defaultParserState "" xs) - -isRight :: Either a b -> Bool -isRight (Right _) = True -isRight (Left _) = False - --- | Convert Pandoc inline element to markdown. 
-inlineToMarkdown :: PandocMonad m => WriterOptions -> Inline -> MD m (Doc Text) -inlineToMarkdown opts (Span ("",["emoji"],kvs) [Str s]) = - case lookup "data-emoji" kvs of - Just emojiname | isEnabled Ext_emoji opts -> - return $ ":" <> literal emojiname <> ":" - _ -> inlineToMarkdown opts (Str s) -inlineToMarkdown opts (Span attrs ils) = do - variant <- asks envVariant - contents <- inlineListToMarkdown opts ils - return $ case variant of - PlainText -> contents - _ | attrs == nullAttr -> contents - | isEnabled Ext_bracketed_spans opts -> - let attrs' = if attrs /= nullAttr - then attrsToMarkdown attrs - else empty - in "[" <> contents <> "]" <> attrs' - | isEnabled Ext_raw_html opts || - isEnabled Ext_native_spans opts -> - tagWithAttrs "span" attrs <> contents <> literal "</span>" - | otherwise -> contents -inlineToMarkdown _ (Emph []) = return empty -inlineToMarkdown opts (Emph lst) = do - variant <- asks envVariant - contents <- inlineListToMarkdown opts lst - return $ case variant of - PlainText - | isEnabled Ext_gutenberg opts -> "_" <> contents <> "_" - | otherwise -> contents - _ -> "*" <> contents <> "*" -inlineToMarkdown _ (Underline []) = return empty -inlineToMarkdown opts (Underline lst) = do - variant <- asks envVariant - contents <- inlineListToMarkdown opts lst - case variant of - PlainText -> return contents - _ | isEnabled Ext_bracketed_spans opts -> - return $ "[" <> contents <> "]" <> "{.ul}" - | isEnabled Ext_native_spans opts -> - return $ tagWithAttrs "span" ("", ["underline"], []) - <> contents - <> literal "</span>" - | isEnabled Ext_raw_html opts -> - return $ "<u>" <> contents <> "</u>" - | otherwise -> inlineToMarkdown opts (Emph lst) -inlineToMarkdown _ (Strong []) = return empty -inlineToMarkdown opts (Strong lst) = do - variant <- asks envVariant - case variant of - PlainText -> - inlineListToMarkdown opts $ - if isEnabled Ext_gutenberg opts - then capitalize lst - else lst - _ -> do - contents <- inlineListToMarkdown opts lst - return $ "**" <> contents <> "**" -inlineToMarkdown _ (Strikeout []) = return empty -inlineToMarkdown opts (Strikeout lst) = do - contents <- inlineListToMarkdown opts lst - return $ if isEnabled Ext_strikeout opts - then "~~" <> contents <> "~~" - else if isEnabled Ext_raw_html opts - then "<s>" <> contents <> "</s>" - else contents -inlineToMarkdown _ (Superscript []) = return empty -inlineToMarkdown opts (Superscript lst) = - local (\env -> env {envEscapeSpaces = envVariant env == Markdown}) $ do - contents <- inlineListToMarkdown opts lst - if isEnabled Ext_superscript opts - then return $ "^" <> contents <> "^" - else if isEnabled Ext_raw_html opts - then return $ "<sup>" <> contents <> "</sup>" - else - case traverse toSuperscriptInline lst of - Just xs' | not (writerPreferAscii opts) - -> inlineListToMarkdown opts xs' - _ -> do - let rendered = render Nothing contents - return $ - case mapM toSuperscript (T.unpack rendered) of - Just r -> literal $ T.pack r - Nothing -> literal $ "^(" <> rendered <> ")" -inlineToMarkdown _ (Subscript []) = return empty -inlineToMarkdown opts (Subscript lst) = - local (\env -> env {envEscapeSpaces = envVariant env == Markdown}) $ do - contents <- inlineListToMarkdown opts lst - if isEnabled Ext_subscript opts - then return $ "~" <> contents <> "~" - else if isEnabled Ext_raw_html opts - then return $ "<sub>" <> contents <> "</sub>" - else - case traverse toSubscriptInline lst of - Just xs' | not (writerPreferAscii opts) - -> inlineListToMarkdown opts xs' - _ -> do - let rendered = render 
Nothing contents - return $ - case mapM toSuperscript (T.unpack rendered) of - Just r -> literal $ T.pack r - Nothing -> literal $ "_(" <> rendered <> ")" -inlineToMarkdown opts (SmallCaps lst) = do - variant <- asks envVariant - if variant /= PlainText && - (isEnabled Ext_raw_html opts || isEnabled Ext_native_spans opts) - then inlineToMarkdown opts (Span ("",["smallcaps"],[]) lst) - else inlineListToMarkdown opts $ capitalize lst -inlineToMarkdown opts (Quoted SingleQuote lst) = do - contents <- inlineListToMarkdown opts lst - return $ if isEnabled Ext_smart opts - then "'" <> contents <> "'" - else - if writerPreferAscii opts - then "‘" <> contents <> "’" - else "‘" <> contents <> "’" -inlineToMarkdown opts (Quoted DoubleQuote lst) = do - contents <- inlineListToMarkdown opts lst - return $ if isEnabled Ext_smart opts - then "\"" <> contents <> "\"" - else - if writerPreferAscii opts - then "“" <> contents <> "”" - else "“" <> contents <> "”" -inlineToMarkdown opts (Code attr str) = do - let tickGroups = filter (T.any (== '`')) $ T.group str - let longest = if null tickGroups - then 0 - else maximum $ map T.length tickGroups - let marker = T.replicate (longest + 1) "`" - let spacer = if longest == 0 then "" else " " - let attrs = if isEnabled Ext_inline_code_attributes opts && attr /= nullAttr - then attrsToMarkdown attr - else empty - variant <- asks envVariant - case variant of - PlainText -> return $ literal str - _ -> return $ literal - (marker <> spacer <> str <> spacer <> marker) <> attrs -inlineToMarkdown opts (Str str) = do - variant <- asks envVariant - let str' = (if writerPreferAscii opts - then toHtml5Entities - else id) . - (if isEnabled Ext_smart opts - then unsmartify opts - else id) . - (if variant == PlainText - then id - else escapeText opts) $ str - return $ literal str' -inlineToMarkdown opts (Math InlineMath str) = - case writerHTMLMathMethod opts of - WebTeX url -> inlineToMarkdown opts - (Image nullAttr [Str str] (url <> T.pack (urlEncode $ T.unpack str), str)) - _ | isEnabled Ext_tex_math_dollars opts -> - return $ "$" <> literal str <> "$" - | isEnabled Ext_tex_math_single_backslash opts -> - return $ "\\(" <> literal str <> "\\)" - | isEnabled Ext_tex_math_double_backslash opts -> - return $ "\\\\(" <> literal str <> "\\\\)" - | otherwise -> do - variant <- asks envVariant - texMathToInlines InlineMath str >>= - inlineListToMarkdown opts . 
- (if variant == PlainText then makeMathPlainer else id) -inlineToMarkdown opts (Math DisplayMath str) = - case writerHTMLMathMethod opts of - WebTeX url -> (\x -> blankline <> x <> blankline) `fmap` - inlineToMarkdown opts (Image nullAttr [Str str] - (url <> T.pack (urlEncode $ T.unpack str), str)) - _ | isEnabled Ext_tex_math_dollars opts -> - return $ "$$" <> literal str <> "$$" - | isEnabled Ext_tex_math_single_backslash opts -> - return $ "\\[" <> literal str <> "\\]" - | isEnabled Ext_tex_math_double_backslash opts -> - return $ "\\\\[" <> literal str <> "\\\\]" - | otherwise -> (\x -> cr <> x <> cr) `fmap` - (texMathToInlines DisplayMath str >>= inlineListToMarkdown opts) -inlineToMarkdown opts il@(RawInline f str) = do - let tickGroups = filter (T.any (== '`')) $ T.group str - let numticks = if null tickGroups - then 1 - else 1 + maximum (map T.length tickGroups) - variant <- asks envVariant - let Format fmt = f - let rawAttribInline = return $ - literal (T.replicate numticks "`") <> literal str <> - literal (T.replicate numticks "`") <> literal "{=" <> literal fmt <> literal "}" - let renderEmpty = mempty <$ report (InlineNotRendered il) - case variant of - PlainText -> renderEmpty - _ | f `elem` ["markdown", "markdown_github", "markdown_phpextra", - "markdown_mmd", "markdown_strict"] -> - return $ literal str - | isEnabled Ext_raw_attribute opts -> rawAttribInline - | f `elem` ["html", "html5", "html4"] -> - case () of - _ | isEnabled Ext_raw_html opts -> return $ literal str - | isEnabled Ext_raw_attribute opts -> rawAttribInline - | otherwise -> renderEmpty - | f `elem` ["latex", "tex"] -> - case () of - _ | isEnabled Ext_raw_tex opts -> return $ literal str - | isEnabled Ext_raw_attribute opts -> rawAttribInline - | otherwise -> renderEmpty - | otherwise -> renderEmpty -inlineToMarkdown opts LineBreak = do - variant <- asks envVariant - if variant == PlainText || isEnabled Ext_hard_line_breaks opts - then return cr - else return $ - if isEnabled Ext_escaped_line_breaks opts - then "\\" <> cr - else " " <> cr -inlineToMarkdown _ Space = do - escapeSpaces <- asks envEscapeSpaces - return $ if escapeSpaces then "\\ " else space -inlineToMarkdown opts SoftBreak = do - escapeSpaces <- asks envEscapeSpaces - let space' = if escapeSpaces then "\\ " else space - return $ case writerWrapText opts of - WrapNone -> space' - WrapAuto -> space' - WrapPreserve -> cr -inlineToMarkdown opts (Cite [] lst) = inlineListToMarkdown opts lst -inlineToMarkdown opts (Cite (c:cs) lst) - | not (isEnabled Ext_citations opts) = inlineListToMarkdown opts lst - | otherwise = - if citationMode c == AuthorInText - then do - suffs <- inlineListToMarkdown opts $ citationSuffix c - rest <- mapM convertOne cs - let inbr = suffs <+> joincits rest - br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' - return $ literal ("@" <> citationId c) <+> br - else do - cits <- mapM convertOne (c:cs) - return $ literal "[" <> joincits cits <> literal "]" - where - joincits = hcat . intersperse (literal "; ") . filter (not . 
isEmpty) - convertOne Citation { citationId = k - , citationPrefix = pinlines - , citationSuffix = sinlines - , citationMode = m } - = do - pdoc <- inlineListToMarkdown opts pinlines - sdoc <- inlineListToMarkdown opts sinlines - let k' = literal (modekey m <> "@" <> k) - r = case sinlines of - Str (T.uncons -> Just (y,_)):_ | y `elem` (",;]@" :: String) -> k' <> sdoc - _ -> k' <+> sdoc - return $ pdoc <+> r - modekey SuppressAuthor = "-" - modekey _ = "" -inlineToMarkdown opts lnk@(Link attr txt (src, tit)) = do - variant <- asks envVariant - linktext <- inlineListToMarkdown opts txt - let linktitle = if T.null tit - then empty - else literal $ " \"" <> tit <> "\"" - let srcSuffix = fromMaybe src (T.stripPrefix "mailto:" src) - let useAuto = isURI src && - case txt of - [Str s] | escapeURI s == srcSuffix -> True - _ -> False - let useRefLinks = writerReferenceLinks opts && not useAuto - shortcutable <- asks envRefShortcutable - let useShortcutRefLinks = shortcutable && - isEnabled Ext_shortcut_reference_links opts - reftext <- if useRefLinks - then literal <$> getReference attr linktext (src, tit) - else return mempty - case variant of - PlainText - | useAuto -> return $ literal srcSuffix - | otherwise -> return linktext - _ | useAuto -> return $ "<" <> literal srcSuffix <> ">" - | useRefLinks -> - let first = "[" <> linktext <> "]" - second = if getKey linktext == getKey reftext - then if useShortcutRefLinks - then "" - else "[]" - else "[" <> reftext <> "]" - in return $ first <> second - | isEnabled Ext_raw_html opts - , not (isEnabled Ext_link_attributes opts) - , attr /= nullAttr -> -- use raw HTML to render attributes - literal . T.strip <$> - writeHtml5String opts{ writerTemplate = Nothing } - (Pandoc nullMeta [Plain [lnk]]) - | otherwise -> return $ - "[" <> linktext <> "](" <> literal src <> linktitle <> ")" <> - linkAttributes opts attr -inlineToMarkdown opts img@(Image attr alternate (source, tit)) - | isEnabled Ext_raw_html opts && - not (isEnabled Ext_link_attributes opts) && - attr /= nullAttr = -- use raw HTML - literal . T.strip <$> - writeHtml5String opts{ writerTemplate = Nothing } (Pandoc nullMeta [Plain [img]]) - | otherwise = do - variant <- asks envVariant - let txt = if null alternate || alternate == [Str source] - -- to prevent autolinks - then [Str ""] - else alternate - linkPart <- inlineToMarkdown opts (Link attr txt (source, tit)) - return $ case variant of - PlainText -> "[" <> linkPart <> "]" - _ -> "!" <> linkPart -inlineToMarkdown opts (Note contents) = do - modify (\st -> st{ stNotes = contents : stNotes st }) - st <- get - let ref = literal $ writerIdentifierPrefix opts <> tshow (stNoteNum st + length (stNotes st) - 1) - if isEnabled Ext_footnotes opts - then return $ "[^" <> ref <> "]" - else return $ "[" <> ref <> "]" - -makeMathPlainer :: [Inline] -> [Inline] -makeMathPlainer = walk go - where - go (Emph xs) = Span nullAttr xs - go x = x - lineBreakToSpace :: Inline -> Inline lineBreakToSpace LineBreak = Space lineBreakToSpace SoftBreak = Space lineBreakToSpace x = x - -toSubscriptInline :: Inline -> Maybe Inline -toSubscriptInline Space = Just Space -toSubscriptInline (Span attr ils) = Span attr <$> traverse toSubscriptInline ils -toSubscriptInline (Str s) = Str . 
T.pack <$> traverse toSubscript (T.unpack s) -toSubscriptInline LineBreak = Just LineBreak -toSubscriptInline SoftBreak = Just SoftBreak -toSubscriptInline _ = Nothing - -toSuperscriptInline :: Inline -> Maybe Inline -toSuperscriptInline Space = Just Space -toSuperscriptInline (Span attr ils) = Span attr <$> traverse toSuperscriptInline ils -toSuperscriptInline (Str s) = Str . T.pack <$> traverse toSuperscript (T.unpack s) -toSuperscriptInline LineBreak = Just LineBreak -toSuperscriptInline SoftBreak = Just SoftBreak -toSuperscriptInline _ = Nothing diff --git a/src/Text/Pandoc/Writers/Markdown/Inline.hs b/src/Text/Pandoc/Writers/Markdown/Inline.hs new file mode 100644 index 000000000..cd5f5b896 --- /dev/null +++ b/src/Text/Pandoc/Writers/Markdown/Inline.hs @@ -0,0 +1,616 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Writers.Markdown.Inline + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Writers.Markdown.Inline ( + inlineListToMarkdown, + linkAttributes, + attrsToMarkdown + ) where +import Control.Monad.Reader +import Control.Monad.State.Strict +import Data.Char (isAlphaNum, isDigit) +import Data.List (find, intersperse) +import Data.List.NonEmpty (nonEmpty) +import qualified Data.Map as M +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import qualified Data.Text as T +import Network.HTTP (urlEncode) +import Text.Pandoc.Class.PandocMonad (PandocMonad, report) +import Text.Pandoc.Definition +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space) +import Text.DocLayout +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Walk +import Text.Pandoc.Writers.HTML (writeHtml5String) +import Text.Pandoc.Writers.Math (texMathToInlines) +import Text.Pandoc.XML (toHtml5Entities) +import Data.Coerce (coerce) +import Text.Pandoc.Writers.Markdown.Types (MarkdownVariant(..), + WriterState(..), + WriterEnv(..), MD) + +-- | Escape special characters for Markdown. +escapeText :: WriterOptions -> Text -> Text +escapeText opts = T.pack . go . T.unpack + where + startsWithSpace (' ':_) = True + startsWithSpace ('\t':_) = True + startsWithSpace [] = True + startsWithSpace _ = False + go [] = [] + go (c:cs) = + case c of + '<' | isEnabled Ext_all_symbols_escapable opts -> + '\\' : '<' : go cs + | otherwise -> "<" ++ go cs + '>' | isEnabled Ext_all_symbols_escapable opts -> + '\\' : '>' : go cs + | otherwise -> ">" ++ go cs + '@' | isEnabled Ext_citations opts -> + case cs of + (d:_) + | isAlphaNum d || d == '_' || d == '{' + -> '\\':'@':go cs + _ -> '@':go cs + '#' | isEnabled Ext_space_in_atx_header opts + , startsWithSpace cs + -> '\\':'#':go cs + _ | c `elem` ['\\','`','*','_','[',']'] -> + '\\':c:go cs + '|' | isEnabled Ext_pipe_tables opts -> '\\':'|':go cs + '^' | isEnabled Ext_superscript opts -> '\\':'^':go cs + '~' | isEnabled Ext_subscript opts || + isEnabled Ext_strikeout opts -> '\\':'~':go cs + '$' | isEnabled Ext_tex_math_dollars opts -> '\\':'$':go cs + '\'' | isEnabled Ext_smart opts -> '\\':'\'':go cs + '"' | isEnabled Ext_smart opts -> '\\':'"':go cs + '-' | isEnabled Ext_smart opts -> + case cs of + '-':_ -> '\\':'-':go cs + _ -> '-':go cs + '.' 
| isEnabled Ext_smart opts -> + case cs of + '.':'.':rest -> '\\':'.':'.':'.':go rest + _ -> '.':go cs + _ -> case cs of + '_':x:xs + | isEnabled Ext_intraword_underscores opts + , isAlphaNum c + , isAlphaNum x -> c : '_' : x : go xs + '#':xs -> c : '#' : go xs + '>':xs -> c : '>' : go xs + _ -> c : go cs + +attrsToMarkdown :: Attr -> Doc Text +attrsToMarkdown attribs = braces $ hsep [attribId, attribClasses, attribKeys] + where attribId = case attribs of + ("",_,_) -> empty + (i,_,_) -> "#" <> escAttr i + attribClasses = case attribs of + (_,[],_) -> empty + (_,cs,_) -> hsep $ + map (escAttr . ("."<>)) + cs + attribKeys = case attribs of + (_,_,[]) -> empty + (_,_,ks) -> hsep $ + map (\(k,v) -> escAttr k + <> "=\"" <> + escAttr v <> "\"") ks + escAttr = mconcat . map escAttrChar . T.unpack + escAttrChar '"' = literal "\\\"" + escAttrChar '\\' = literal "\\\\" + escAttrChar c = literal $ T.singleton c + +linkAttributes :: WriterOptions -> Attr -> Doc Text +linkAttributes opts attr = + if (isEnabled Ext_link_attributes opts || isEnabled Ext_attributes opts) && attr /= nullAttr + then attrsToMarkdown attr + else empty + +getKey :: Doc Text -> Key +getKey = toKey . render Nothing + +findUsableIndex :: [Text] -> Int -> Int +findUsableIndex lbls i = if tshow i `elem` lbls + then findUsableIndex lbls (i + 1) + else i + +getNextIndex :: PandocMonad m => MD m Int +getNextIndex = do + prevRefs <- gets stPrevRefs + refs <- gets stRefs + i <- (+ 1) <$> gets stLastIdx + modify $ \s -> s{ stLastIdx = i } + let refLbls = map (\(r,_,_) -> r) $ prevRefs ++ refs + return $ findUsableIndex refLbls i + +-- | Get reference for target; if none exists, create unique one and return. +-- Prefer label if possible; otherwise, generate a unique key. +getReference :: PandocMonad m => Attr -> Doc Text -> Target -> MD m Text +getReference attr label target = do + refs <- gets stRefs + case find (\(_,t,a) -> t == target && a == attr) refs of + Just (ref, _, _) -> return ref + Nothing -> do + keys <- gets stKeys + let key = getKey label + let rawkey = coerce key + case M.lookup key keys of + Nothing -> do -- no other refs with this label + (lab', idx) <- if T.null rawkey || + T.length rawkey > 999 || + T.any (\c -> c == '[' || c == ']') rawkey + then do + i <- getNextIndex + return (tshow i, i) + else + return (render Nothing label, 0) + modify (\s -> s{ + stRefs = (lab', target, attr) : refs, + stKeys = M.insert (getKey label) + (M.insert (target, attr) idx mempty) + (stKeys s) }) + return lab' + + Just km -> -- we have refs with this label + case M.lookup (target, attr) km of + Just i -> do + let lab' = render Nothing $ + label <> if i == 0 + then mempty + else literal (tshow i) + -- make sure it's in stRefs; it may be + -- a duplicate that was printed in a previous + -- block: + when ((lab', target, attr) `notElem` refs) $ + modify (\s -> s{ + stRefs = (lab', target, attr) : refs }) + return lab' + Nothing -> do -- but this one is to a new target + i <- getNextIndex + let lab' = tshow i + modify (\s -> s{ + stRefs = (lab', target, attr) : refs, + stKeys = M.insert key + (M.insert (target, attr) i km) + (stKeys s) }) + return lab' + +-- | Convert list of Pandoc inline elements to markdown. 
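(Editorial note, not part of the patch.) attrsToMarkdown, defined above, is the helper that produces the {#id .class key="val"} attribute blocks attached to bracketed spans, inline code and links later in this module. A minimal sketch of what it yields, assuming the internal module is reachable (in practice that means code living inside pandoc itself, for example its test suite):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Text (Text)
    import Text.DocLayout (render)
    import Text.Pandoc.Writers.Markdown.Inline (attrsToMarkdown)

    -- renders as: {#fig1 .wide width="50%"}
    attrDemo :: Text
    attrDemo = render Nothing (attrsToMarkdown ("fig1", ["wide"], [("width", "50%")]))
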
+inlineListToMarkdown :: PandocMonad m => WriterOptions -> [Inline] -> MD m (Doc Text) +inlineListToMarkdown opts lst = do + inlist <- asks envInList + go (if inlist then avoidBadWrapsInList lst else lst) + where go [] = return empty + go (x@Math{}:y@(Str t):zs) + | T.all isDigit (T.take 1 t) -- starts with digit -- see #7058 + = liftM2 (<>) (inlineToMarkdown opts x) + (go (RawInline (Format "html") "<!-- -->" : y : zs)) + go (i:is) = case i of + Link {} -> case is of + -- If a link is followed by another link, or '[', '(' or ':' + -- then we don't shortcut + Link {}:_ -> unshortcutable + Space:Link {}:_ -> unshortcutable + Space:(Str(thead -> Just '[')):_ -> unshortcutable + Space:(RawInline _ (thead -> Just '[')):_ -> unshortcutable + Space:(Cite _ _):_ -> unshortcutable + SoftBreak:Link {}:_ -> unshortcutable + SoftBreak:(Str(thead -> Just '[')):_ -> unshortcutable + SoftBreak:(RawInline _ (thead -> Just '[')):_ -> unshortcutable + SoftBreak:(Cite _ _):_ -> unshortcutable + LineBreak:Link {}:_ -> unshortcutable + LineBreak:(Str(thead -> Just '[')):_ -> unshortcutable + LineBreak:(RawInline _ (thead -> Just '[')):_ -> unshortcutable + LineBreak:(Cite _ _):_ -> unshortcutable + (Cite _ _):_ -> unshortcutable + Str (thead -> Just '['):_ -> unshortcutable + Str (thead -> Just '('):_ -> unshortcutable + Str (thead -> Just ':'):_ -> unshortcutable + (RawInline _ (thead -> Just '[')):_ -> unshortcutable + (RawInline _ (thead -> Just '(')):_ -> unshortcutable + (RawInline _ (thead -> Just ':')):_ -> unshortcutable + (RawInline _ (T.stripPrefix " [" -> Just _ )):_ -> unshortcutable + _ -> shortcutable + _ -> shortcutable + where shortcutable = liftM2 (<>) (inlineToMarkdown opts i) (go is) + unshortcutable = do + iMark <- local + (\env -> env { envRefShortcutable = False }) + (inlineToMarkdown opts i) + fmap (iMark <>) (go is) + thead = fmap fst . T.uncons + +isSp :: Inline -> Bool +isSp Space = True +isSp SoftBreak = True +isSp _ = False + +avoidBadWrapsInList :: [Inline] -> [Inline] +avoidBadWrapsInList [] = [] +avoidBadWrapsInList (s:Str (T.uncons -> Just ('>',cs)):xs) | isSp s = + Str (" >" <> cs) : avoidBadWrapsInList xs +avoidBadWrapsInList [s, Str (T.uncons -> Just (c, cs))] + | T.null cs && isSp s && c `elem` ['-','*','+'] = [Str $ T.pack [' ', c]] +avoidBadWrapsInList (s:Str (T.uncons -> Just (c, cs)):Space:xs) + | T.null cs && isSp s && c `elem` ['-','*','+'] = + Str (T.pack [' ', c]) : Space : avoidBadWrapsInList xs +avoidBadWrapsInList (s:Str cs:Space:xs) + | isSp s && isOrderedListMarker cs = + Str (" " <> cs) : Space : avoidBadWrapsInList xs +avoidBadWrapsInList [s, Str cs] + | isSp s && isOrderedListMarker cs = [Str $ " " <> cs] +avoidBadWrapsInList (x:xs) = x : avoidBadWrapsInList xs + +isOrderedListMarker :: Text -> Bool +isOrderedListMarker xs = not (T.null xs) && (T.last xs `elem` ['.',')']) && + isRight (runParser (anyOrderedListMarker >> eof) + defaultParserState "" xs) + where + isRight (Right _) = True + isRight (Left _) = False + +-- | Convert Pandoc inline element to markdown. 
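(Editorial illustration, not part of the patch.) The long case analysis above decides when a reference link may be shortened to a bare [label]: whenever the link is followed by a citation or by material beginning with '[', '(' or ':', shortcutting is switched off so the output cannot be re-read as a different construct. Roughly, using only the public API (the URL and document below are assumptions made up for the demo, not taken from the patch):

    {-# LANGUAGE OverloadedStrings #-}
    import qualified Data.Text.IO as TIO
    import Text.Pandoc

    main :: IO ()
    main = do
      let opts = def { writerExtensions     = pandocExtensions
                     , writerReferenceLinks = True }
          lnk  = Link nullAttr [Str "site"] ("https://example.org", "")
          doc  = Pandoc nullMeta
                   [ Para [lnk, Space, Str "works."]   -- can be shortcut to [site]
                   , Para [lnk, Space, Str "[sic]"]    -- stays [site][] so it is not
                   ]                                   -- read back as [site][sic]
      runIOorExplode (writeMarkdown opts doc) >>= TIO.putStrLn
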
+inlineToMarkdown :: PandocMonad m => WriterOptions -> Inline -> MD m (Doc Text) +inlineToMarkdown opts (Span ("",["emoji"],kvs) [Str s]) = + case lookup "data-emoji" kvs of + Just emojiname | isEnabled Ext_emoji opts -> + return $ ":" <> literal emojiname <> ":" + _ -> inlineToMarkdown opts (Str s) +inlineToMarkdown opts (Span attrs ils) = do + variant <- asks envVariant + contents <- inlineListToMarkdown opts ils + return $ case attrs of + (_,["csl-block"],_) -> (cr <>) + (_,["csl-left-margin"],_) -> (cr <>) + (_,["csl-indent"],_) -> (cr <>) + _ -> id + $ case variant of + PlainText -> contents + _ | attrs == nullAttr -> contents + | isEnabled Ext_bracketed_spans opts -> + let attrs' = if attrs /= nullAttr + then attrsToMarkdown attrs + else empty + in "[" <> contents <> "]" <> attrs' + | isEnabled Ext_raw_html opts || + isEnabled Ext_native_spans opts -> + tagWithAttrs "span" attrs <> contents <> literal "</span>" + | otherwise -> contents +inlineToMarkdown _ (Emph []) = return empty +inlineToMarkdown opts (Emph lst) = do + variant <- asks envVariant + contents <- inlineListToMarkdown opts lst + return $ case variant of + PlainText + | isEnabled Ext_gutenberg opts -> "_" <> contents <> "_" + | otherwise -> contents + _ -> "*" <> contents <> "*" +inlineToMarkdown _ (Underline []) = return empty +inlineToMarkdown opts (Underline lst) = do + variant <- asks envVariant + contents <- inlineListToMarkdown opts lst + case variant of + PlainText -> return contents + _ | isEnabled Ext_bracketed_spans opts -> + return $ "[" <> contents <> "]" <> "{.ul}" + | isEnabled Ext_native_spans opts -> + return $ tagWithAttrs "span" ("", ["underline"], []) + <> contents + <> literal "</span>" + | isEnabled Ext_raw_html opts -> + return $ "<u>" <> contents <> "</u>" + | otherwise -> inlineToMarkdown opts (Emph lst) +inlineToMarkdown _ (Strong []) = return empty +inlineToMarkdown opts (Strong lst) = do + variant <- asks envVariant + case variant of + PlainText -> + inlineListToMarkdown opts $ + if isEnabled Ext_gutenberg opts + then capitalize lst + else lst + _ -> do + contents <- inlineListToMarkdown opts lst + return $ "**" <> contents <> "**" +inlineToMarkdown _ (Strikeout []) = return empty +inlineToMarkdown opts (Strikeout lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_strikeout opts + then "~~" <> contents <> "~~" + else if isEnabled Ext_raw_html opts + then "<s>" <> contents <> "</s>" + else contents +inlineToMarkdown _ (Superscript []) = return empty +inlineToMarkdown opts (Superscript lst) = + local (\env -> env {envEscapeSpaces = envVariant env == Markdown}) $ do + contents <- inlineListToMarkdown opts lst + if isEnabled Ext_superscript opts + then return $ "^" <> contents <> "^" + else if isEnabled Ext_raw_html opts + then return $ "<sup>" <> contents <> "</sup>" + else + case traverse toSuperscriptInline lst of + Just xs' | not (writerPreferAscii opts) + -> inlineListToMarkdown opts xs' + _ -> do + let rendered = render Nothing contents + return $ + case mapM toSuperscript (T.unpack rendered) of + Just r -> literal $ T.pack r + Nothing -> literal $ "^(" <> rendered <> ")" +inlineToMarkdown _ (Subscript []) = return empty +inlineToMarkdown opts (Subscript lst) = + local (\env -> env {envEscapeSpaces = envVariant env == Markdown}) $ do + contents <- inlineListToMarkdown opts lst + if isEnabled Ext_subscript opts + then return $ "~" <> contents <> "~" + else if isEnabled Ext_raw_html opts + then return $ "<sub>" <> contents <> "</sub>" + else + case traverse 
toSubscriptInline lst of + Just xs' | not (writerPreferAscii opts) + -> inlineListToMarkdown opts xs' + _ -> do + let rendered = render Nothing contents + return $ + case mapM toSuperscript (T.unpack rendered) of + Just r -> literal $ T.pack r + Nothing -> literal $ "_(" <> rendered <> ")" +inlineToMarkdown opts (SmallCaps lst) = do + variant <- asks envVariant + if variant /= PlainText && + (isEnabled Ext_raw_html opts || isEnabled Ext_native_spans opts) + then inlineToMarkdown opts (Span ("",["smallcaps"],[]) lst) + else inlineListToMarkdown opts $ capitalize lst +inlineToMarkdown opts (Quoted SingleQuote lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_smart opts + then "'" <> contents <> "'" + else + if writerPreferAscii opts + then "‘" <> contents <> "’" + else "‘" <> contents <> "’" +inlineToMarkdown opts (Quoted DoubleQuote lst) = do + contents <- inlineListToMarkdown opts lst + return $ if isEnabled Ext_smart opts + then "\"" <> contents <> "\"" + else + if writerPreferAscii opts + then "“" <> contents <> "”" + else "“" <> contents <> "”" +inlineToMarkdown opts (Code attr str) = do + let tickGroups = filter (T.any (== '`')) $ T.group str + let longest = maybe 0 maximum $ nonEmpty $ map T.length tickGroups + let marker = T.replicate (longest + 1) "`" + let spacer = if longest == 0 then "" else " " + let attrsEnabled = isEnabled Ext_inline_code_attributes opts || + isEnabled Ext_attributes opts + let attrs = if attrsEnabled && attr /= nullAttr + then attrsToMarkdown attr + else empty + variant <- asks envVariant + case variant of + PlainText -> return $ literal str + _ -> return $ literal + (marker <> spacer <> str <> spacer <> marker) <> attrs +inlineToMarkdown opts (Str str) = do + variant <- asks envVariant + let str' = (if writerPreferAscii opts + then toHtml5Entities + else id) . + (if isEnabled Ext_smart opts + then unsmartify opts + else id) . + (if variant == PlainText + then id + else escapeText opts) $ str + return $ literal str' +inlineToMarkdown opts (Math InlineMath str) = + case writerHTMLMathMethod opts of + WebTeX url -> inlineToMarkdown opts + (Image nullAttr [Str str] (url <> T.pack (urlEncode $ T.unpack str), str)) + _ | isEnabled Ext_tex_math_dollars opts -> + return $ "$" <> literal str <> "$" + | isEnabled Ext_tex_math_single_backslash opts -> + return $ "\\(" <> literal str <> "\\)" + | isEnabled Ext_tex_math_double_backslash opts -> + return $ "\\\\(" <> literal str <> "\\\\)" + | otherwise -> do + variant <- asks envVariant + texMathToInlines InlineMath str >>= + inlineListToMarkdown opts . 
+ (if variant == PlainText then makeMathPlainer else id) +inlineToMarkdown opts (Math DisplayMath str) = + case writerHTMLMathMethod opts of + WebTeX url -> (\x -> blankline <> x <> blankline) `fmap` + inlineToMarkdown opts (Image nullAttr [Str str] + (url <> T.pack (urlEncode $ T.unpack str), str)) + _ | isEnabled Ext_tex_math_dollars opts -> + return $ "$$" <> literal str <> "$$" + | isEnabled Ext_tex_math_single_backslash opts -> + return $ "\\[" <> literal str <> "\\]" + | isEnabled Ext_tex_math_double_backslash opts -> + return $ "\\\\[" <> literal str <> "\\\\]" + | otherwise -> (\x -> cr <> x <> cr) `fmap` + (texMathToInlines DisplayMath str >>= inlineListToMarkdown opts) +inlineToMarkdown opts il@(RawInline f str) = do + let tickGroups = filter (T.any (== '`')) $ T.group str + let numticks = 1 + maybe 0 maximum (nonEmpty (map T.length tickGroups)) + variant <- asks envVariant + let Format fmt = f + let rawAttribInline = return $ + literal (T.replicate numticks "`") <> literal str <> + literal (T.replicate numticks "`") <> literal "{=" <> literal fmt <> literal "}" + let renderEmpty = mempty <$ report (InlineNotRendered il) + case variant of + PlainText -> renderEmpty + Commonmark + | f `elem` ["gfm", "commonmark", "commonmark_x", "markdown"] + -> return $ literal str + Markdown + | f `elem` ["markdown", "markdown_github", "markdown_phpextra", + "markdown_mmd", "markdown_strict"] + -> return $ literal str + _ | isEnabled Ext_raw_attribute opts -> rawAttribInline + | f `elem` ["html", "html5", "html4"] + , isEnabled Ext_raw_html opts + -> return $ literal str + | f `elem` ["latex", "tex"] + , isEnabled Ext_raw_tex opts + -> return $ literal str + _ -> renderEmpty + + +inlineToMarkdown opts LineBreak = do + variant <- asks envVariant + if variant == PlainText || isEnabled Ext_hard_line_breaks opts + then return cr + else return $ + if isEnabled Ext_escaped_line_breaks opts + then "\\" <> cr + else " " <> cr +inlineToMarkdown _ Space = do + escapeSpaces <- asks envEscapeSpaces + return $ if escapeSpaces then "\\ " else space +inlineToMarkdown opts SoftBreak = do + escapeSpaces <- asks envEscapeSpaces + let space' = if escapeSpaces then "\\ " else space + return $ case writerWrapText opts of + WrapNone -> space' + WrapAuto -> space' + WrapPreserve -> cr +inlineToMarkdown opts (Cite [] lst) = inlineListToMarkdown opts lst +inlineToMarkdown opts (Cite (c:cs) lst) + | not (isEnabled Ext_citations opts) = inlineListToMarkdown opts lst + | otherwise = + if citationMode c == AuthorInText + then do + suffs <- inlineListToMarkdown opts $ citationSuffix c + rest <- mapM convertOne cs + let inbr = suffs <+> joincits rest + br = if isEmpty inbr then empty else char '[' <> inbr <> char ']' + return $ literal ("@" <> maybeInBraces (citationId c)) <+> br + else do + cits <- mapM convertOne (c:cs) + return $ literal "[" <> joincits cits <> literal "]" + where + maybeInBraces key = + case readWith (citeKey False >> spaces >> eof) + defaultParserState ("@" <> key) of + Left _ -> "{" <> key <> "}" + Right _ -> key + joincits = hcat . intersperse (literal "; ") . filter (not . 
isEmpty) + convertOne Citation { citationId = k + , citationPrefix = pinlines + , citationSuffix = sinlines + , citationMode = m } + = do + pdoc <- inlineListToMarkdown opts pinlines + sdoc <- inlineListToMarkdown opts sinlines + let k' = literal (modekey m <> "@" <> maybeInBraces k) + r = case sinlines of + Str (T.uncons -> Just (y,_)):_ | y `elem` (",;]@" :: String) -> k' <> sdoc + _ -> k' <+> sdoc + return $ pdoc <+> r + modekey SuppressAuthor = "-" + modekey _ = "" +inlineToMarkdown opts lnk@(Link attr txt (src, tit)) = do + variant <- asks envVariant + linktext <- inlineListToMarkdown opts txt + let linktitle = if T.null tit + then empty + else literal $ " \"" <> tit <> "\"" + let srcSuffix = fromMaybe src (T.stripPrefix "mailto:" src) + let useAuto = isURI src && + case txt of + [Str s] | escapeURI s == srcSuffix -> True + _ -> False + let useRefLinks = writerReferenceLinks opts && not useAuto + shortcutable <- asks envRefShortcutable + let useShortcutRefLinks = shortcutable && + isEnabled Ext_shortcut_reference_links opts + reftext <- if useRefLinks + then literal <$> getReference attr linktext (src, tit) + else return mempty + case variant of + PlainText + | useAuto -> return $ literal srcSuffix + | otherwise -> return linktext + _ | useAuto -> return $ "<" <> literal srcSuffix <> ">" + | useRefLinks -> + let first = "[" <> linktext <> "]" + second = if getKey linktext == getKey reftext + then if useShortcutRefLinks + then "" + else "[]" + else "[" <> reftext <> "]" + in return $ first <> second + | isEnabled Ext_raw_html opts + , not (isEnabled Ext_link_attributes opts || isEnabled Ext_attributes opts) + , attr /= nullAttr -> -- use raw HTML to render attributes + literal . T.strip <$> + writeHtml5String opts{ writerTemplate = Nothing } + (Pandoc nullMeta [Plain [lnk]]) + | otherwise -> return $ + "[" <> linktext <> "](" <> literal src <> linktitle <> ")" <> + linkAttributes opts attr +inlineToMarkdown opts img@(Image attr alternate (source, tit)) + | isEnabled Ext_raw_html opts && + not (isEnabled Ext_link_attributes opts || isEnabled Ext_attributes opts) && + attr /= nullAttr = -- use raw HTML + literal . T.strip <$> + writeHtml5String opts{ writerTemplate = Nothing } (Pandoc nullMeta [Plain [img]]) + | otherwise = do + variant <- asks envVariant + let txt = if null alternate || alternate == [Str source] + -- to prevent autolinks + then [Str ""] + else alternate + linkPart <- inlineToMarkdown opts (Link attr txt (source, tit)) + return $ case variant of + PlainText -> "[" <> linkPart <> "]" + _ -> "!" <> linkPart +inlineToMarkdown opts (Note contents) = do + modify (\st -> st{ stNotes = contents : stNotes st }) + st <- get + let ref = literal $ writerIdentifierPrefix opts <> tshow (stNoteNum st + length (stNotes st) - 1) + if isEnabled Ext_footnotes opts + then return $ "[^" <> ref <> "]" + else return $ "[" <> ref <> "]" + +makeMathPlainer :: [Inline] -> [Inline] +makeMathPlainer = walk go + where + go (Emph xs) = Span nullAttr xs + go x = x + +toSubscriptInline :: Inline -> Maybe Inline +toSubscriptInline Space = Just Space +toSubscriptInline (Span attr ils) = Span attr <$> traverse toSubscriptInline ils +toSubscriptInline (Str s) = Str . 
T.pack <$> traverse toSubscript (T.unpack s) +toSubscriptInline LineBreak = Just LineBreak +toSubscriptInline SoftBreak = Just SoftBreak +toSubscriptInline _ = Nothing + +toSuperscriptInline :: Inline -> Maybe Inline +toSuperscriptInline Space = Just Space +toSuperscriptInline (Span attr ils) = Span attr <$> traverse toSuperscriptInline ils +toSuperscriptInline (Str s) = Str . T.pack <$> traverse toSuperscript (T.unpack s) +toSuperscriptInline LineBreak = Just LineBreak +toSuperscriptInline SoftBreak = Just SoftBreak +toSuperscriptInline _ = Nothing diff --git a/src/Text/Pandoc/Writers/Markdown/Types.hs b/src/Text/Pandoc/Writers/Markdown/Types.hs new file mode 100644 index 000000000..a1d0d14e4 --- /dev/null +++ b/src/Text/Pandoc/Writers/Markdown/Types.hs @@ -0,0 +1,81 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Writers.Markdown.Types + Copyright : Copyright (C) 2006-2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable +-} +module Text.Pandoc.Writers.Markdown.Types ( + MarkdownVariant(..), + WriterState(..), + WriterEnv(..), + Notes, + Ref, + Refs, + MD, + evalMD + ) where +import Control.Monad.Reader +import Control.Monad.State.Strict +import Data.Default +import qualified Data.Map as M +import qualified Data.Set as Set +import Data.Text (Text) +import Text.Pandoc.Parsing (Key) +import Text.Pandoc.Class.PandocMonad (PandocMonad) +import Text.Pandoc.Definition + +type Notes = [[Block]] +type Ref = (Text, Target, Attr) +type Refs = [Ref] + +type MD m = ReaderT WriterEnv (StateT WriterState m) + +evalMD :: PandocMonad m => MD m a -> WriterEnv -> WriterState -> m a +evalMD md env st = evalStateT (runReaderT md env) st + +data WriterEnv = WriterEnv { envInList :: Bool + , envVariant :: MarkdownVariant + , envRefShortcutable :: Bool + , envBlockLevel :: Int + , envEscapeSpaces :: Bool + } + +data MarkdownVariant = + PlainText + | Commonmark + | Markdown + deriving (Show, Eq) + +instance Default WriterEnv + where def = WriterEnv { envInList = False + , envVariant = Markdown + , envRefShortcutable = True + , envBlockLevel = 0 + , envEscapeSpaces = False + } + +data WriterState = WriterState { stNotes :: Notes + , stPrevRefs :: Refs + , stRefs :: Refs + , stKeys :: M.Map Key + (M.Map (Target, Attr) Int) + , stLastIdx :: Int + , stIds :: Set.Set Text + , stNoteNum :: Int + } + +instance Default WriterState + where def = WriterState{ stNotes = [] + , stPrevRefs = [] + , stRefs = [] + , stKeys = M.empty + , stLastIdx = 0 + , stIds = Set.empty + , stNoteNum = 1 + } + + diff --git a/src/Text/Pandoc/Writers/MediaWiki.hs b/src/Text/Pandoc/Writers/MediaWiki.hs index d1912caa6..5029be69f 100644 --- a/src/Text/Pandoc/Writers/MediaWiki.hs +++ b/src/Text/Pandoc/Writers/MediaWiki.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.MediaWiki - Copyright : Copyright (C) 2008-2020 John MacFarlane + Copyright : Copyright (C) 2008-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/Ms.hs b/src/Text/Pandoc/Writers/Ms.hs index 96914d3c6..97c23f24d 100644 --- a/src/Text/Pandoc/Writers/Ms.hs +++ b/src/Text/Pandoc/Writers/Ms.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.Ms - Copyright : Copyright (C) 2007-2020 John MacFarlane + Copyright : Copyright (C) 2007-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John 
MacFarlane <jgm@berkeley.edu> @@ -23,6 +23,7 @@ module Text.Pandoc.Writers.Ms ( writeMs ) where import Control.Monad.State.Strict import Data.Char (isLower, isUpper, ord) import Data.List (intercalate, intersperse) +import Data.List.NonEmpty (nonEmpty) import qualified Data.Map as Map import Data.Maybe (catMaybes) import Data.Text (Text) @@ -244,13 +245,17 @@ blockToMs opts (Table _ blkCapt specs thead tbody tfoot) = aligncode AlignDefault = "l" in do caption' <- inlineListToMs' opts caption - let iwidths = if all (== 0) widths - then repeat "" - else map (T.pack . printf "w(%0.1fn)" . (70 *)) widths + let isSimple = all (== 0) widths + let totalWidth = 70 -- 78n default width - 8n indent = 70n let coldescriptions = literal $ T.unwords - (zipWith (\align width -> aligncode align <> width) - alignments iwidths) <> "." + (zipWith (\align width -> aligncode align <> + if width == 0 + then "" + else T.pack $ + printf "w(%0.1fn)" + (totalWidth * width)) + alignments widths) <> "." colheadings <- mapM (blockListToMs opts) headers let makeRow cols = literal "T{" $$ vcat (intersperse (literal "T}\tT{") cols) $$ @@ -259,12 +264,26 @@ blockToMs opts (Table _ blkCapt specs thead tbody tfoot) = then empty else makeRow colheadings $$ char '_' body <- mapM (\row -> do - cols <- mapM (blockListToMs opts) row + cols <- mapM (\(cell, w) -> + (if isSimple + then id + else (literal (".nr LL " <> + T.pack (printf "%0.1fn" + (w * totalWidth))) $$)) <$> + blockListToMs opts cell) (zip row widths) return $ makeRow cols) rows setFirstPara return $ literal ".PP" $$ caption' $$ + literal ".na" $$ -- we don't want justification in table cells + (if isSimple + then "" + else ".nr LLold \\n[LL]") $$ literal ".TS" $$ literal "delim(@@) tab(\t);" $$ coldescriptions $$ - colheadings' $$ vcat body $$ literal ".TE" + colheadings' $$ vcat body $$ literal ".TE" $$ + (if isSimple + then "" + else ".nr LL \\n[LLold]") $$ + literal ".ad" blockToMs opts (BulletList items) = do contents <- mapM (bulletListItemToMs opts) items @@ -272,8 +291,7 @@ blockToMs opts (BulletList items) = do return (vcat contents) blockToMs opts (OrderedList attribs items) = do let markers = take (length items) $ orderedListMarkers attribs - let indent = 2 + - maximum (map T.length markers) + let indent = 2 + maybe 0 maximum (nonEmpty (map T.length markers)) contents <- mapM (\(num, item) -> orderedListItemToMs opts num indent item) $ zip markers items setFirstPara diff --git a/src/Text/Pandoc/Writers/Muse.hs b/src/Text/Pandoc/Writers/Muse.hs index bf3265107..d5100f43f 100644 --- a/src/Text/Pandoc/Writers/Muse.hs +++ b/src/Text/Pandoc/Writers/Muse.hs @@ -31,6 +31,7 @@ import Control.Monad.State.Strict import Data.Char (isAlphaNum, isAsciiLower, isAsciiUpper, isDigit, isSpace) import Data.Default import Data.List (intersperse, transpose) +import Data.List.NonEmpty (nonEmpty, NonEmpty(..)) import qualified Data.Set as Set import qualified Data.Text as T import Data.Text (Text) @@ -158,7 +159,8 @@ simpleTable caption headers rows = do caption' <- inlineListToMuse caption headers' <- mapM blockListToMuse headers rows' <- mapM (mapM blockListToMuse) rows - let widthsInChars = maximum . map offset <$> transpose (headers' : rows') + let widthsInChars = maybe 0 maximum . nonEmpty . map offset <$> + transpose (headers' : rows') let hpipeBlocks sep blocks = hcat $ intersperse sep' blocks where sep' = lblock (T.length sep) $ literal sep let makeRow sep = hpipeBlocks sep . 
zipWith lblock widthsInChars @@ -238,7 +240,7 @@ blockToMuse (DefinitionList items) = do label' <- local (\env -> env { envOneLine = True, envAfterSpace = True }) $ inlineListToMuse' label let ind = offset' label' -- using Text.DocLayout.offset results in round trip failures hang ind (nowrap label') . vcat <$> mapM descriptionToMuse defs - where offset' d = maximum (0: map T.length + where offset' d = maximum (0 :| map T.length (T.lines $ render Nothing d)) descriptionToMuse :: PandocMonad m => [Block] @@ -269,7 +271,8 @@ blockToMuse (Table _ blkCapt specs thead tbody tfoot) = (caption, aligns, widths, headers, rows) = toLegacyTable blkCapt specs thead tbody tfoot blocksToDoc opts blocks = local (\env -> env { envOptions = opts }) $ blockListToMuse blocks - numcols = maximum (length aligns : length widths : map length (headers:rows)) + numcols = maximum + (length aligns :| length widths : map length (headers:rows)) isSimple = onlySimpleTableCells (headers : rows) && all (== 0) widths blockToMuse (Div _ bs) = flatBlockListToMuse bs blockToMuse Null = return empty diff --git a/src/Text/Pandoc/Writers/Native.hs b/src/Text/Pandoc/Writers/Native.hs index 4d4dfca15..9c2ce805d 100644 --- a/src/Text/Pandoc/Writers/Native.hs +++ b/src/Text/Pandoc/Writers/Native.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Native - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index e41fb7176..e4eb4fd25 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.ODT - Copyright : Copyright (C) 2008-2020 John MacFarlane + Copyright : Copyright (C) 2008-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -13,9 +13,10 @@ Conversion of 'Pandoc' documents to ODT. 
-} module Text.Pandoc.Writers.ODT ( writeODT ) where import Codec.Archive.Zip -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict import qualified Data.ByteString.Lazy as B +import Data.Maybe (fromMaybe) import Data.Generics (everywhere', mkT) import Data.List (isPrefixOf) import qualified Data.Map as Map @@ -23,10 +24,11 @@ import qualified Data.Text as T import qualified Data.Text.Lazy as TL import Data.Time import System.FilePath (takeDirectory, takeExtension, (<.>)) -import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang) +import Text.Collate.Lang (Lang (..), renderLang) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.ImageSize import Text.Pandoc.Logging import Text.Pandoc.MIME (extensionFromMimeType, getMimeType) @@ -34,13 +36,14 @@ import Text.Pandoc.Options (WrapOption (..), WriterOptions (..)) import Text.DocLayout import Text.Pandoc.Shared (stringify, pandocVersion, tshow) import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks, - fixDisplayMath) -import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy) + fixDisplayMath, getLang) +import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy) import Text.Pandoc.Walk import Text.Pandoc.Writers.OpenDocument (writeOpenDocument) import Text.Pandoc.XML +import Text.Pandoc.XML.Light import Text.TeXMath -import Text.XML.Light +import qualified Text.XML.Light as XL newtype ODTState = ODTState { stEntries :: [Entry] } @@ -66,7 +69,7 @@ pandocToODT :: PandocMonad m pandocToODT opts doc@(Pandoc meta _) = do let title = docTitle meta let authors = docAuthors meta - utctime <- P.getCurrentTime + utctime <- P.getTimestamp lang <- toLang (getLang opts meta) refArchive <- case writerReferenceDoc opts of @@ -172,24 +175,27 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive updateStyleWithLang Nothing arch = return arch updateStyleWithLang (Just lang) arch = do epochtime <- floor `fmap` lift P.getPOSIXTime - return arch{ zEntries = [if eRelativePath e == "styles.xml" - then case parseXMLDoc - (toStringLazy (fromEntry e)) of - Nothing -> e - Just d -> - toEntry "styles.xml" epochtime - ( fromStringLazy - . ppTopElement - . addLang lang $ d ) - else e - | e <- zEntries arch] } + entries <- mapM (\e -> if eRelativePath e == "styles.xml" + then case parseXMLElement + (toTextLazy (fromEntry e)) of + Left msg -> throwError $ + PandocXMLError "styles.xml" msg + Right d -> return $ + toEntry "styles.xml" epochtime + ( fromTextLazy + . TL.fromStrict + . ppTopElement + . addLang lang $ d ) + else return e) (zEntries arch) + return arch{ zEntries = entries } +-- TODO FIXME avoid this generic traversal! 
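(Editorial sketch, not part of the patch.) The language handling here, as in App.hs and OpenDocument.hs earlier in this diff, now goes through Text.Collate.Lang: script and region are Maybe values and the constructor takes six arguments, which is why this file guards the region with fromMaybe in the addLang helper that follows. The constructor shape matches the defaults used elsewhere in the patch:

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Maybe (fromMaybe)
    import Data.Text (Text)
    import Text.Collate.Lang (Lang (..), renderLang)

    enUS :: Lang
    enUS = Lang "en" Nothing (Just "US") [] [] []

    country :: Text
    country = fromMaybe "" (langRegion enUS)  -- "US"; renderLang enUS should give "en-US"
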
addLang :: Lang -> Element -> Element addLang lang = everywhere' (mkT updateLangAttr) where updateLangAttr (Attr n@(QName "language" _ (Just "fo")) _) - = Attr n (T.unpack $ langLanguage lang) + = Attr n (langLanguage lang) updateLangAttr (Attr n@(QName "country" _ (Just "fo")) _) - = Attr n (T.unpack $ langRegion lang) + = Attr n (fromMaybe "" $ langRegion lang) updateLangAttr x = x -- | transform both Image and Math elements @@ -235,8 +241,8 @@ transformPicMath _ (Math t math) = do case writeMathML dt <$> readTeX math of Left _ -> return $ Math t math Right r -> do - let conf = useShortEmptyTags (const False) defaultConfigPP - let mathml = ppcTopElement conf r + let conf = XL.useShortEmptyTags (const False) XL.defaultConfigPP + let mathml = XL.ppcTopElement conf r epochtime <- floor `fmap` lift P.getPOSIXTime let dirname = "Formula-" ++ show (length entries) ++ "/" let fname = dirname ++ "content.xml" diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs index ac991b594..0533d6c12 100644 --- a/src/Text/Pandoc/Writers/OOXML.hs +++ b/src/Text/Pandoc/Writers/OOXML.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.OOXML - Copyright : Copyright (C) 2012-2020 John MacFarlane + Copyright : Copyright (C) 2012-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -29,32 +29,32 @@ import Control.Monad.Except (throwError) import Text.Pandoc.Error import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL -import qualified Data.ByteString.Lazy.Char8 as BL8 import Data.Maybe (mapMaybe) import qualified Data.Text as T +import Data.Text (Text) import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light as XML +import Text.Pandoc.XML.Light -mknode :: Node t => String -> [(String,String)] -> t -> Element +mknode :: Node t => Text -> [(Text,Text)] -> t -> Element mknode s attrs = add_attrs (map (\(k,v) -> Attr (nodename k) v) attrs) . node (nodename s) -mktnode :: String -> [(String,String)] -> T.Text -> Element -mktnode s attrs = mknode s attrs . T.unpack +mktnode :: Text -> [(Text,Text)] -> T.Text -> Element +mktnode s attrs = mknode s attrs -nodename :: String -> QName +nodename :: Text -> QName nodename s = QName{ qName = name, qURI = Nothing, qPrefix = prefix } - where (name, prefix) = case break (==':') s of - (xs,[]) -> (xs, Nothing) - (ys, _:zs) -> (zs, Just ys) + where (name, prefix) = case T.break (==':') s of + (xs,ys) -> case T.uncons ys of + Nothing -> (xs, Nothing) + Just (_,zs) -> (zs, Just xs) toLazy :: B.ByteString -> BL.ByteString toLazy = BL.fromChunks . (:[]) renderXml :: Element -> BL.ByteString -renderXml elt = BL8.pack "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" <> - UTF8.fromStringLazy (showElement elt) +renderXml elt = BL.fromStrict (UTF8.fromText (showTopElement elt)) parseXml :: PandocMonad m => Archive -> Archive -> String -> m Element parseXml refArchive distArchive relpath = @@ -62,32 +62,32 @@ parseXml refArchive distArchive relpath = findEntryByPath relpath distArchive of Nothing -> throwError $ PandocSomeError $ T.pack relpath <> " missing in reference file" - Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of - Nothing -> throwError $ PandocSomeError $ - T.pack relpath <> " corrupt in reference file" - Just d -> return d + Just e -> case parseXMLElement . UTF8.toTextLazy . 
fromEntry $ e of + Left msg -> + throwError $ PandocXMLError (T.pack relpath) msg + Right d -> return d -- Copied from Util -attrToNSPair :: XML.Attr -> Maybe (String, String) -attrToNSPair (XML.Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair :: Attr -> Maybe (Text, Text) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) attrToNSPair _ = Nothing elemToNameSpaces :: Element -> NameSpaces elemToNameSpaces = mapMaybe attrToNSPair . elAttribs -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = let ns' = ns ++ elemToNameSpaces element in qName (elName element) == name && qURI (elName element) == lookup prefix ns' -type NameSpaces = [(String, String)] +type NameSpaces = [(Text, Text)] -- | Scales the image to fit the page -- sizes are passed in emu diff --git a/src/Text/Pandoc/Writers/OPML.hs b/src/Text/Pandoc/Writers/OPML.hs index 810a94775..8c9229fc0 100644 --- a/src/Text/Pandoc/Writers/OPML.hs +++ b/src/Text/Pandoc/Writers/OPML.hs @@ -2,7 +2,7 @@ {-# LANGUAGE CPP #-} {- | Module : Text.Pandoc.Writers.OPML - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index 071a5542f..5f3224c2f 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -1,4 +1,5 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} @@ -17,6 +18,7 @@ module Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) where import Control.Arrow ((***), (>>>)) import Control.Monad.State.Strict hiding (when) import Data.Char (chr) +import Data.Foldable (find) import Data.List (sortOn, sortBy, foldl') import qualified Data.Map as Map import Data.Maybe (fromMaybe, isNothing) @@ -24,7 +26,7 @@ import Data.Ord (comparing) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import Text.Pandoc.BCP47 (Lang (..), parseBCP47) +import Text.Collate.Lang (Lang (..), parseLang) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, translateTerm, setTranslations, toLang) import Text.Pandoc.Definition @@ -35,6 +37,7 @@ import Text.DocLayout import Text.Pandoc.Shared (linesToPara, tshow, blocksToInlines) import Text.Pandoc.Templates (renderTemplate) import qualified Text.Pandoc.Translations as Term (Term(Figure, Table)) +import Text.Pandoc.Walk import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared import qualified Text.Pandoc.Writers.AnnotatedTable as Ann @@ -54,6 +57,11 @@ plainToPara x = x type OD m = StateT WriterState m +data ReferenceType + = HeaderRef + | TableRef + | ImageRef + data WriterState = WriterState { stNotes :: [Doc Text] , stTableStyles :: [Doc Text] @@ -69,6 +77,7 @@ data WriterState = , stImageId :: Int , stTableCaptionId :: Int , stImageCaptionId :: Int + , stIdentTypes :: [(Text,ReferenceType)] } defaultWriterState :: WriterState @@ -86,6 +95,7 @@ defaultWriterState = , 
stImageId = 1 , stTableCaptionId = 1 , stImageCaptionId = 1 + , stIdentTypes = [] } when :: Bool -> Doc Text -> Doc Text @@ -227,7 +237,7 @@ handleSpaces s = case T.uncons s of -- | Convert Pandoc document to string in OpenDocument format. writeOpenDocument :: PandocMonad m => WriterOptions -> Pandoc -> m Text writeOpenDocument opts (Pandoc meta blocks) = do - let defLang = Lang "en" "US" "" [] + let defLang = Lang "en" (Just "US") Nothing [] [] [] lang <- case lookupMetaString "lang" meta of "" -> pure defLang s -> fromMaybe defLang <$> toLang (Just s) @@ -243,6 +253,12 @@ writeOpenDocument opts (Pandoc meta blocks) = do meta ((body, metadata),s) <- flip runStateT defaultWriterState $ do + let collectInlineIdent (Image (ident,_,_) _ _) = [(ident,ImageRef)] + collectInlineIdent _ = [] + let collectBlockIdent (Header _ (ident,_,_) _) = [(ident,HeaderRef)] + collectBlockIdent (Table (ident,_,_) _ _ _ _ _) = [(ident,TableRef)] + collectBlockIdent _ = [] + modify $ \s -> s{ stIdentTypes = query collectBlockIdent blocks ++ query collectInlineIdent blocks } m <- metaToContext opts (blocksToOpenDocument opts) (fmap chomp . inlinesToOpenDocument opts) @@ -357,36 +373,32 @@ blocksToOpenDocument o b = vcat <$> mapM (blockToOpenDocument o) b -- | Convert a Pandoc block element to OpenDocument. blockToOpenDocument :: PandocMonad m => WriterOptions -> Block -> OD m (Doc Text) -blockToOpenDocument o bs - | Plain b <- bs = if null b - then return empty - else inParagraphTags =<< inlinesToOpenDocument o b - | Para [Image attr c (s,T.stripPrefix "fig:" -> Just t)] <- bs - = figure attr c s t - | Para b <- bs = if null b && - not (isEnabled Ext_empty_paragraphs o) - then return empty - else inParagraphTags =<< inlinesToOpenDocument o b - | LineBlock b <- bs = blockToOpenDocument o $ linesToPara b - | Div attr xs <- bs = mkDiv attr xs - | Header i (ident,_,_) b - <- bs = setFirstPara >> (inHeaderTags i ident - =<< inlinesToOpenDocument o b) - | BlockQuote b <- bs = setFirstPara >> mkBlockQuote b - | DefinitionList b <- bs = setFirstPara >> defList b - | BulletList b <- bs = setFirstPara >> bulletListToOpenDocument o b - | OrderedList a b <- bs = setFirstPara >> orderedList a b - | CodeBlock _ s <- bs = setFirstPara >> preformatted s - | Table a bc s th tb tf <- bs = setFirstPara >> table (Ann.toTable a bc s th tb tf) - | HorizontalRule <- bs = setFirstPara >> return (selfClosingTag "text:p" - [ ("text:style-name", "Horizontal_20_Line") ]) - | RawBlock f s <- bs = if f == Format "opendocument" - then return $ text $ T.unpack s - else do - report $ BlockNotRendered bs - return empty - | Null <- bs = return empty - | otherwise = return empty +blockToOpenDocument o = \case + Plain b -> if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + Para [Image attr c (s,T.stripPrefix "fig:" -> Just t)] -> figure attr c s t + Para b -> if null b && + not (isEnabled Ext_empty_paragraphs o) + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + LineBlock b -> blockToOpenDocument o $ linesToPara b + Div attr xs -> mkDiv attr xs + Header i (ident,_,_) b -> do + setFirstPara + inHeaderTags i ident =<< inlinesToOpenDocument o b + BlockQuote b -> setFirstPara >> mkBlockQuote b + DefinitionList b -> setFirstPara >> defList b + BulletList b -> setFirstPara >> bulletListToOpenDocument o b + OrderedList a b -> setFirstPara >> orderedList a b + CodeBlock _ s -> setFirstPara >> preformatted s + Table a bc s th tb tf -> setFirstPara >> table (Ann.toTable a bc s th tb tf) + 
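(Editorial sketch, not part of the patch.) The stIdentTypes table set up in writeOpenDocument above is filled by a generic Walk.query over the document, pairing each Header, Table and Image identifier with its reference type so that the link writer can later choose between bookmark and sequence references. The same query pattern in isolation:

    import Data.Text (Text)
    import Text.Pandoc.Definition
    import Text.Pandoc.Walk (query)

    -- Collect Header identifiers the way stIdentTypes is populated above;
    -- the Table and Image cases work the same way.
    headerIdents :: [Block] -> [Text]
    headerIdents = query getIdent
      where
        getIdent (Header _ (ident, _, _) _) = [ident]
        getIdent _                          = []
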
HorizontalRule -> setFirstPara >> return (selfClosingTag "text:p" + [ ("text:style-name", "Horizontal_20_Line") ]) + b@(RawBlock f s) -> if f == Format "opendocument" + then return $ text $ T.unpack s + else empty <$ report (BlockNotRendered b) + Null -> return empty where defList b = do setInDefinitionList True r <- vcat <$> mapM (deflistItemToOpenDocument o) b @@ -411,7 +423,7 @@ blockToOpenDocument o bs inTags True "text:list" [ ("text:style-name", "L" <> tshow ln)] <$> orderedListToOpenDocument o pn b table :: PandocMonad m => Ann.Table -> OD m (Doc Text) - table (Ann.Table _ (Caption _ c) colspecs thead tbodies _) = do + table (Ann.Table (ident, _, _) (Caption _ c) colspecs thead tbodies _) = do tn <- length <$> gets stTableStyles pn <- length <$> gets stParaStyles let genIds = map chr [65..] @@ -433,7 +445,7 @@ blockToOpenDocument o bs then return empty else inlinesToOpenDocument o (blocksToInlines c) >>= if isEnabled Ext_native_numbering o - then numberedTableCaption + then numberedTableCaption ident else unNumberedCaption "TableCaption" th <- colHeadsToOpenDocument o (map fst paraHStyles) thead tr <- mapM (tableBodyToOpenDocument o (map fst paraStyles)) tbodies @@ -442,36 +454,39 @@ blockToOpenDocument o bs , ("table:style-name", name) ] (vcat columns $$ th $$ vcat tr) return $ captionDoc $$ tableDoc - figure attr caption source title | null caption = + figure attr@(ident, _, _) caption source title | null caption = withParagraphStyle o "Figure" [Para [Image attr caption (source,title)]] | otherwise = do imageDoc <- withParagraphStyle o "FigureWithCaption" [Para [Image attr caption (source,title)]] captionDoc <- inlinesToOpenDocument o caption >>= if isEnabled Ext_native_numbering o - then numberedFigureCaption + then numberedFigureCaption ident else unNumberedCaption "FigureCaption" return $ imageDoc $$ captionDoc -numberedTableCaption :: PandocMonad m => Doc Text -> OD m (Doc Text) -numberedTableCaption caption = do +numberedTableCaption :: PandocMonad m => Text -> Doc Text -> OD m (Doc Text) +numberedTableCaption ident caption = do id' <- gets stTableCaptionId modify (\st -> st{ stTableCaptionId = id' + 1 }) capterm <- translateTerm Term.Table - return $ numberedCaption "TableCaption" capterm "Table" id' caption + return $ numberedCaption "TableCaption" capterm "Table" id' ident caption -numberedFigureCaption :: PandocMonad m => Doc Text -> OD m (Doc Text) -numberedFigureCaption caption = do +numberedFigureCaption :: PandocMonad m => Text -> Doc Text -> OD m (Doc Text) +numberedFigureCaption ident caption = do id' <- gets stImageCaptionId modify (\st -> st{ stImageCaptionId = id' + 1 }) capterm <- translateTerm Term.Figure - return $ numberedCaption "FigureCaption" capterm "Illustration" id' caption + return $ numberedCaption "FigureCaption" capterm "Illustration" id' ident caption -numberedCaption :: Text -> Text -> Text -> Int -> Doc Text -> Doc Text -numberedCaption style term name num caption = +numberedCaption :: Text -> Text -> Text -> Int -> Text -> Doc Text -> Doc Text +numberedCaption style term name num ident caption = let t = text $ T.unpack term r = num - 1 - s = inTags False "text:sequence" [ ("text:ref-name", "ref" <> name <> tshow r), + ident' = case ident of + "" -> "ref" <> name <> tshow r + _ -> ident + s = inTags False "text:sequence" [ ("text:ref-name", ident'), ("text:name", name), ("text:formula", "ooow:" <> name <> "+1"), ("style:num-format", "1") ] $ text $ show num @@ -607,7 +622,9 @@ inlineToOpenDocument o ils else do report $ InlineNotRendered ils 
return empty - Link _ l (s,t) -> mkLink s t <$> inlinesToOpenDocument o l + Link _ l (s,t) -> do + identTypes <- gets stIdentTypes + mkLink o identTypes s t <$> inlinesToOpenDocument o l Image attr _ (s,t) -> mkImg attr s t Note l -> mkNote l where @@ -619,10 +636,6 @@ inlineToOpenDocument o ils unhighlighted s = inlinedCode $ preformatted s preformatted s = handleSpaces $ escapeStringForXML s inlinedCode s = return $ inTags False "text:span" [("text:style-name", "Source_Text")] s - mkLink s t = inTags False "text:a" [ ("xlink:type" , "simple") - , ("xlink:href" , s ) - , ("office:name", t ) - ] . inSpanTags "Definition" mkImg (_, _, kvs) s _ = do id' <- gets stImageId modify (\st -> st{ stImageId = id' + 1 }) @@ -659,6 +672,45 @@ inlineToOpenDocument o ils addNote nn return nn +mkLink :: WriterOptions -> [(Text,ReferenceType)] -> Text -> Text -> Doc Text -> Doc Text +mkLink o identTypes s t d = + let maybeIdentAndType = case T.uncons s of + Just ('#', ident) -> find ((ident ==) . fst) identTypes + _ -> Nothing + d' = inSpanTags "Definition" d + ref refType format ident = inTags False refType + [ ("text:reference-format", format ), + ("text:ref-name", ident) ] + inlineSpace = selfClosingTag "text:s" [] + bookmarkRef = ref "text:bookmark-ref" + bookmarkRefNumber ident = bookmarkRef "number" ident mempty + bookmarkRefName ident = bookmarkRef "text" ident d + bookmarkRefNameNumber ident = bookmarkRefNumber ident <> inlineSpace <> bookmarkRefName ident + bookmarkRef' + | isEnabled Ext_xrefs_number o && isEnabled Ext_xrefs_name o = bookmarkRefNameNumber + | isEnabled Ext_xrefs_name o = bookmarkRefName + | otherwise = bookmarkRefNumber + sequenceRef = ref "text:sequence-ref" + sequenceRefNumber ident = sequenceRef "value" ident mempty + sequenceRefName ident = sequenceRef "caption" ident d + sequenceRefNameNumber ident = sequenceRefNumber ident <> inlineSpace <> sequenceRefName ident + sequenceRef' + | isEnabled Ext_xrefs_number o && isEnabled Ext_xrefs_name o = sequenceRefNameNumber + | isEnabled Ext_xrefs_name o = sequenceRefName + | otherwise = sequenceRefNumber + link = inTags False "text:a" [ ("xlink:type" , "simple") + , ("xlink:href" , s ) + , ("office:name", t ) + ] d' + linkOrReference = case maybeIdentAndType of + Just (ident, HeaderRef) -> bookmarkRef' ident + Just (ident, TableRef) -> sequenceRef' ident + Just (ident, ImageRef) -> sequenceRef' ident + _ -> link + in if isEnabled Ext_xrefs_name o || isEnabled Ext_xrefs_number o + then linkOrReference + else link + bulletListStyle :: PandocMonad m => Int -> OD m (Int,(Int,[Doc Text])) bulletListStyle l = do let doStyles i = inTags True "text:list-level-style-bullet" @@ -819,34 +871,33 @@ data TextStyle = Italic textStyleAttr :: Map.Map Text Text -> TextStyle -> Map.Map Text Text -textStyleAttr m s - | Italic <- s = Map.insert "fo:font-style" "italic" . - Map.insert "style:font-style-asian" "italic" . - Map.insert "style:font-style-complex" "italic" $ m - | Bold <- s = Map.insert "fo:font-weight" "bold" . - Map.insert "style:font-weight-asian" "bold" . - Map.insert "style:font-weight-complex" "bold" $ m - | Under <- s = Map.insert "style:text-underline-style" "solid" . - Map.insert "style:text-underline-width" "auto" . 
- Map.insert "style:text-underline-color" "font-color" $ m - | Strike <- s = Map.insert "style:text-line-through-style" "solid" m - | Sub <- s = Map.insert "style:text-position" "sub 58%" m - | Sup <- s = Map.insert "style:text-position" "super 58%" m - | SmallC <- s = Map.insert "fo:font-variant" "small-caps" m - | Pre <- s = Map.insert "style:font-name" "Courier New" . - Map.insert "style:font-name-asian" "Courier New" . - Map.insert "style:font-name-complex" "Courier New" $ m - | Language lang <- s - = Map.insert "fo:language" (langLanguage lang) . - Map.insert "fo:country" (langRegion lang) $ m - | otherwise = m +textStyleAttr m = \case + Italic -> Map.insert "fo:font-style" "italic" . + Map.insert "style:font-style-asian" "italic" . + Map.insert "style:font-style-complex" "italic" $ m + Bold -> Map.insert "fo:font-weight" "bold" . + Map.insert "style:font-weight-asian" "bold" . + Map.insert "style:font-weight-complex" "bold" $ m + Under -> Map.insert "style:text-underline-style" "solid" . + Map.insert "style:text-underline-width" "auto" . + Map.insert "style:text-underline-color" "font-color" $ m + Strike -> Map.insert "style:text-line-through-style" "solid" m + Sub -> Map.insert "style:text-position" "sub 58%" m + Sup -> Map.insert "style:text-position" "super 58%" m + SmallC -> Map.insert "fo:font-variant" "small-caps" m + Pre -> Map.insert "style:font-name" "Courier New" . + Map.insert "style:font-name-asian" "Courier New" . + Map.insert "style:font-name-complex" "Courier New" $ m + Language lang -> + Map.insert "fo:language" (langLanguage lang) . + maybe id (Map.insert "fo:country") (langRegion lang) $ m withLangFromAttr :: PandocMonad m => Attr -> OD m a -> OD m a withLangFromAttr (_,_,kvs) action = case lookup "lang" kvs of Nothing -> action Just l -> - case parseBCP47 l of + case parseLang l of Right lang -> withTextStyle (Language lang) action Left _ -> do report $ InvalidLang l diff --git a/src/Text/Pandoc/Writers/Org.hs b/src/Text/Pandoc/Writers/Org.hs index 2af93017d..d404f1c8d 100644 --- a/src/Text/Pandoc/Writers/Org.hs +++ b/src/Text/Pandoc/Writers/Org.hs @@ -3,8 +3,8 @@ {- | Module : Text.Pandoc.Writers.Org Copyright : © 2010-2015 Puneeth Chaganti <punchagan@gmail.com> - 2010-2020 John MacFarlane <jgm@berkeley.edu> - 2016-2020 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> + 2010-2021 John MacFarlane <jgm@berkeley.edu> + 2016-2021 Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de> @@ -17,8 +17,9 @@ Org-Mode: <http://orgmode.org> -} module Text.Pandoc.Writers.Org (writeOrg) where import Control.Monad.State.Strict -import Data.Char (isAlphaNum) +import Data.Char (isAlphaNum, isDigit) import Data.List (intersect, intersperse, partition, transpose) +import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) import qualified Data.Text as T import Text.Pandoc.Class.PandocMonad (PandocMonad, report) @@ -83,12 +84,15 @@ noteToOrg num note = do -- | Escape special characters for Org. escapeString :: Text -> Text -escapeString = escapeStringUsing - [ ('\x2014',"---") - , ('\x2013',"--") - , ('\x2019',"'") - , ('\x2026',"...") - ] +escapeString t + | T.all (\c -> c < '\x2013' || c > '\x2026') t = t + | otherwise = T.concatMap escChar t + where + escChar '\x2013' = "--" + escChar '\x2014' = "---" + escChar '\x2019' = "'" + escChar '\x2026' = "..." 
+ escChar c = T.singleton c isRawFormat :: Format -> Bool isRawFormat f = @@ -163,7 +167,7 @@ blockToOrg (Table _ blkCapt specs thead tbody tfoot) = do else "#+caption: " <> caption'' headers' <- mapM blockListToOrg headers rawRows <- mapM (mapM blockListToOrg) rows - let numChars = maximum . map offset + let numChars = maybe 0 maximum . nonEmpty . map offset -- FIXME: width is not being used. let widthsInChars = map numChars $ transpose (headers' : rawRows) @@ -198,7 +202,7 @@ blockToOrg (OrderedList (start, _, delim) items) = do x -> x let markers = take (length items) $ orderedListMarkers (start, Decimal, delim') - let maxMarkerLength = maximum $ map T.length markers + let maxMarkerLength = maybe 0 maximum . nonEmpty $ map T.length markers let markers' = map (\m -> let s = maxMarkerLength - T.length m in m <> T.replicate s " ") markers contents <- zipWithM orderedListItemToOrg markers' items @@ -213,25 +217,35 @@ blockToOrg (DefinitionList items) = do -- | Convert bullet list item (list of blocks) to Org. bulletListItemToOrg :: PandocMonad m => [Block] -> Org m (Doc Text) bulletListItemToOrg items = do - contents <- blockListToOrg items + exts <- gets $ writerExtensions . stOptions + contents <- blockListToOrg (taskListItemToOrg exts items) return $ hang 2 "- " contents $$ if endsWithPlain items then cr else blankline - -- | Convert ordered list item (a list of blocks) to Org. orderedListItemToOrg :: PandocMonad m => Text -- ^ marker for list item -> [Block] -- ^ list item (list of blocks) -> Org m (Doc Text) orderedListItemToOrg marker items = do - contents <- blockListToOrg items + exts <- gets $ writerExtensions . stOptions + contents <- blockListToOrg (taskListItemToOrg exts items) return $ hang (T.length marker + 1) (literal marker <> space) contents $$ if endsWithPlain items then cr else blankline +-- | Convert a list item containing text starting with @U+2610 BALLOT BOX@ +-- or @U+2612 BALLOT BOX WITH X@ to org checkbox syntax (e.g. @[X]@). +taskListItemToOrg :: Extensions -> [Block] -> [Block] +taskListItemToOrg = handleTaskListItem toOrg + where + toOrg (Str "☐" : Space : is) = Str "[ ]" : Space : is + toOrg (Str "☒" : Space : is) = Str "[X]" : Space : is + toOrg is = is + -- | Convert definition list item (label, list of blocks) to Org. definitionListItemToOrg :: PandocMonad m => ([Inline], [[Block]]) -> Org m (Doc Text) @@ -337,16 +351,20 @@ inlineListToOrg :: PandocMonad m => [Inline] -> Org m (Doc Text) inlineListToOrg lst = hcat <$> mapM inlineToOrg (fixMarkers lst) - where fixMarkers [] = [] -- prevent note refs and list markers from wrapping, see #4171 + where -- Prevent note refs and list markers from wrapping, see #4171 + -- and #7132. + fixMarkers [] = [] fixMarkers (Space : x : rest) | shouldFix x = Str " " : x : fixMarkers rest fixMarkers (SoftBreak : x : rest) | shouldFix x = Str " " : x : fixMarkers rest fixMarkers (x : rest) = x : fixMarkers rest - shouldFix Note{} = True -- Prevent footnotes + shouldFix Note{} = True -- Prevent footnotes shouldFix (Str "-") = True -- Prevent bullet list items - -- TODO: prevent ordered list items + shouldFix (Str x) -- Prevent ordered list items + | Just (cs, c) <- T.unsnoc x = T.all isDigit cs && + (c == '.' || c == ')') shouldFix _ = False -- | Convert Pandoc inline element to Org. @@ -386,9 +404,11 @@ inlineToOrg (Str str) = return . 
literal $ escapeString str inlineToOrg (Math t str) = do modify $ \st -> st{ stHasMath = True } return $ if t == InlineMath - then "$" <> literal str <> "$" - else "$$" <> literal str <> "$$" + then "\\(" <> literal str <> "\\)" + else "\\[" <> literal str <> "\\]" inlineToOrg il@(RawInline f str) + | elem f ["tex", "latex"] && T.isPrefixOf "\\begin" str = + return $ cr <> literal str <> cr | isRawFormat f = return $ literal str | otherwise = do report $ InlineNotRendered il diff --git a/src/Text/Pandoc/Writers/Powerpoint.hs b/src/Text/Pandoc/Writers/Powerpoint.hs index ca3b74a1d..e0573beca 100644 --- a/src/Text/Pandoc/Writers/Powerpoint.hs +++ b/src/Text/Pandoc/Writers/Powerpoint.hs @@ -39,5 +39,5 @@ writePowerpoint opts (Pandoc meta blks) = do let blks' = walk fixDisplayMath blks let (pres, logMsgs) = documentToPresentation opts (Pandoc meta blks') mapM_ report logMsgs - archv <- presentationToArchive opts pres + archv <- presentationToArchive opts meta pres return $ fromArchive archv diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs index 603a84acc..157810216 100644 --- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs +++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs @@ -20,15 +20,16 @@ import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader import Control.Monad.State import Codec.Archive.Zip -import Data.Char (toUpper) import Data.List (intercalate, stripPrefix, nub, union, isPrefixOf, intersperse) import Data.Default +import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Read import Data.Time (formatTime, defaultTimeLocale) import Data.Time.Clock (UTCTime) import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime) import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension) -import Text.XML.Light +import Text.Pandoc.XML.Light as XML import Text.Pandoc.Definition import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Class.PandocMonad (PandocMonad) @@ -37,17 +38,21 @@ import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Options import Text.Pandoc.MIME import qualified Data.ByteString.Lazy as BL +import Text.Pandoc.Writers.Shared (metaToContext) import Text.Pandoc.Writers.OOXML import qualified Data.Map as M import Data.Maybe (mapMaybe, listToMaybe, fromMaybe, maybeToList, catMaybes, isJust) import Text.Pandoc.ImageSize import Control.Applicative ((<|>)) import System.FilePath.Glob -import Text.DocTemplates (FromContext(lookupContext)) +import Text.DocTemplates (FromContext(lookupContext), Context) +import Text.DocLayout (literal) import Text.TeXMath import Text.Pandoc.Writers.Math (convertMath) import Text.Pandoc.Writers.Powerpoint.Presentation +import Text.Pandoc.Shared (tshow, stringify) import Skylighting (fromColor) +import Data.List.NonEmpty (nonEmpty) -- |The 'EMU' type is used to specify sizes in English Metric Units. 
type EMU = Integer @@ -77,19 +82,24 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer) getPresentationSize refArchive distArchive = do entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus` findEntryByPath "ppt/presentation.xml" distArchive - presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry + presElement <- either (const Nothing) return $ + parseXMLElement $ UTF8.toTextLazy $ fromEntry entry let ns = elemToNameSpaces presElement sldSize <- findChild (elemName ns "p" "sldSz") presElement cxS <- findAttr (QName "cx" Nothing Nothing) sldSize cyS <- findAttr (QName "cy" Nothing Nothing) sldSize - (cx, _) <- listToMaybe $ reads cxS :: Maybe (Integer, String) - (cy, _) <- listToMaybe $ reads cyS :: Maybe (Integer, String) + cx <- readTextAsInteger cxS + cy <- readTextAsInteger cyS return (cx `div` 12700, cy `div` 12700) +readTextAsInteger :: Text -> Maybe Integer +readTextAsInteger = either (const Nothing) (Just . fst) . Data.Text.Read.decimal + data WriterEnv = WriterEnv { envRefArchive :: Archive , envDistArchive :: Archive , envUTCTime :: UTCTime , envOpts :: WriterOptions + , envContext :: Context Text , envPresentationSize :: (Integer, Integer) , envSlideHasHeader :: Bool , envInList :: Bool @@ -115,6 +125,7 @@ instance Default WriterEnv where , envDistArchive = emptyArchive , envUTCTime = posixSecondsToUTCTime 0 , envOpts = def + , envContext = mempty , envPresentationSize = (720, 540) , envSlideHasHeader = False , envInList = False @@ -159,20 +170,16 @@ runP env st p = evalStateT (runReaderT p env) st -------------------------------------------------------------------- -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText n = fmap T.pack . findAttr n - monospaceFont :: Monad m => P m T.Text monospaceFont = do - vars <- writerVariables <$> asks envOpts + vars <- asks envContext case lookupContext "monofont" vars of Just s -> return s Nothing -> return "Courier" --- Kept as string for XML.Light -fontSizeAttributes :: Monad m => RunProps -> P m [(String, String)] +fontSizeAttributes :: Monad m => RunProps -> P m [(Text, Text)] fontSizeAttributes RunProps { rPropForceSize = Just sz } = - return [("sz", show $ sz * 100)] + return [("sz", tshow $ sz * 100)] fontSizeAttributes _ = return [] copyFileToArchive :: PandocMonad m => Archive -> FilePath -> P m Archive @@ -301,8 +308,9 @@ makeSpeakerNotesMap (Presentation _ slides) = then Nothing else Just n -presentationToArchive :: PandocMonad m => WriterOptions -> Presentation -> m Archive -presentationToArchive opts pres = do +presentationToArchive :: PandocMonad m + => WriterOptions -> Meta -> Presentation -> m Archive +presentationToArchive opts meta pres = do distArchive <- toArchive . BL.fromStrict <$> P.readDefaultDataFile "reference.pptx" refArchive <- case writerReferenceDoc opts of @@ -310,7 +318,7 @@ presentationToArchive opts pres = do Nothing -> toArchive . BL.fromStrict <$> P.readDataFile "reference.pptx" - utctime <- P.getCurrentTime + utctime <- P.getTimestamp presSize <- case getPresentationSize refArchive distArchive of Just sz -> return sz @@ -318,10 +326,18 @@ presentationToArchive opts pres = do PandocSomeError "Could not determine presentation size" + -- note, we need writerTemplate to be Just _ or metaToContext does + -- nothing + context <- metaToContext opts{ writerTemplate = + writerTemplate opts <|> Just mempty } + (return . literal . stringify) + (return . literal . 
stringify) meta + let env = def { envRefArchive = refArchive , envDistArchive = distArchive , envUTCTime = utctime , envOpts = opts + , envContext = context , envPresentationSize = presSize , envSlideIdMap = makeSlideIdMap pres , envSpeakerNotesIdMap = makeSpeakerNotesMap pres @@ -363,7 +379,7 @@ shapeHasId :: NameSpaces -> T.Text -> Element -> Bool shapeHasId ns ident element | Just nvSpPr <- findChild (elemName ns "p" "nvSpPr") element , Just cNvPr <- findChild (elemName ns "p" "cNvPr") nvSpPr - , Just nm <- findAttrText (QName "id" Nothing Nothing) cNvPr = + , Just nm <- findAttr (QName "id" Nothing Nothing) cNvPr = nm == ident | otherwise = False @@ -394,10 +410,10 @@ getShapeDimensions ns element ext <- findChild (elemName ns "a" "ext") xfrm cxS <- findAttr (QName "cx" Nothing Nothing) ext cyS <- findAttr (QName "cy" Nothing Nothing) ext - (x, _) <- listToMaybe $ reads xS - (y, _) <- listToMaybe $ reads yS - (cx, _) <- listToMaybe $ reads cxS - (cy, _) <- listToMaybe $ reads cyS + x <- readTextAsInteger xS + y <- readTextAsInteger yS + cx <- readTextAsInteger cxS + cy <- readTextAsInteger cyS return ((x `div` 12700, y `div` 12700), (cx `div` 12700, cy `div` 12700)) | otherwise = Nothing @@ -428,7 +444,7 @@ getContentShapeSize ns layout master Nothing -> do let mbSz = findChild (elemName ns "p" "nvSpPr") sp >>= findChild (elemName ns "p" "cNvPr") >>= - findAttrText (QName "id" Nothing Nothing) >>= + findAttr (QName "id" Nothing Nothing) >>= flip getMasterShapeDimensionsById master case mbSz of Just sz' -> return sz' @@ -437,10 +453,10 @@ getContentShapeSize ns layout master getContentShapeSize _ _ _ = throwError $ PandocSomeError "Attempted to find content shape size in non-layout" -buildSpTree :: NameSpaces -> Element -> [Element] -> Element +buildSpTree :: NameSpaces -> Element -> [Content] -> Element buildSpTree ns spTreeElem newShapes = emptySpTreeElem { elContent = newContent } - where newContent = elContent emptySpTreeElem <> map Elem newShapes + where newContent = elContent emptySpTreeElem <> newShapes emptySpTreeElem = spTreeElem { elContent = filter fn (elContent spTreeElem) } fn :: Content -> Bool fn (Elem e) = isElem ns "p" "nvGrpSpPr" e || @@ -448,8 +464,8 @@ buildSpTree ns spTreeElem newShapes = fn _ = True replaceNamedChildren :: NameSpaces - -> String - -> String + -> Text + -> Text -> [Element] -> Element -> Element @@ -472,15 +488,16 @@ registerLink link = do linkReg <- gets stLinkIds mediaReg <- gets stMediaIds hasSpeakerNotes <- curSlideHasSpeakerNotes - let maxLinkId = case M.lookup curSlideId linkReg of - Just mp -> case M.keys mp of - [] -> if hasSpeakerNotes then 2 else 1 - ks -> maximum ks - Nothing -> if hasSpeakerNotes then 2 else 1 - maxMediaId = case M.lookup curSlideId mediaReg of - Just [] -> if hasSpeakerNotes then 2 else 1 - Just mInfos -> maximum $ map mInfoLocalId mInfos - Nothing -> if hasSpeakerNotes then 2 else 1 + let maxLinkId = case M.lookup curSlideId linkReg >>= nonEmpty . 
M.keys of + Just xs -> maximum xs + Nothing + | hasSpeakerNotes -> 2 + | otherwise -> 1 + maxMediaId = case M.lookup curSlideId mediaReg >>= nonEmpty of + Just mInfos -> maximum $ fmap mInfoLocalId mInfos + Nothing + | hasSpeakerNotes -> 2 + | otherwise -> 1 maxId = max maxLinkId maxMediaId slideLinks = case M.lookup curSlideId linkReg of Just mp -> M.insert (maxId + 1) link mp @@ -495,20 +512,19 @@ registerMedia fp caption = do mediaReg <- gets stMediaIds globalIds <- gets stMediaGlobalIds hasSpeakerNotes <- curSlideHasSpeakerNotes - let maxLinkId = case M.lookup curSlideId linkReg of - Just mp -> case M.keys mp of - [] -> if hasSpeakerNotes then 2 else 1 - ks -> maximum ks - Nothing -> if hasSpeakerNotes then 2 else 1 - maxMediaId = case M.lookup curSlideId mediaReg of - Just [] -> if hasSpeakerNotes then 2 else 1 - Just mInfos -> maximum $ map mInfoLocalId mInfos - Nothing -> if hasSpeakerNotes then 2 else 1 + let maxLinkId = case M.lookup curSlideId linkReg >>= nonEmpty . M.keys of + Just ks -> maximum ks + Nothing + | hasSpeakerNotes -> 2 + | otherwise -> 1 + maxMediaId = case M.lookup curSlideId mediaReg >>= nonEmpty of + Just mInfos -> maximum $ fmap mInfoLocalId mInfos + Nothing + | hasSpeakerNotes -> 2 + | otherwise -> 1 maxLocalId = max maxLinkId maxMediaId - maxGlobalId = case M.elems globalIds of - [] -> 0 - ids -> maximum ids + maxGlobalId = maybe 0 maximum $ nonEmpty $ M.elems globalIds (imgBytes, mbMt) <- P.fetchItem $ T.pack fp let imgExt = (mbMt >>= extensionFromMimeType >>= (\x -> return $ "." <> x)) @@ -521,6 +537,7 @@ registerMedia fp caption = do Just Eps -> Just ".eps" Just Svg -> Just ".svg" Just Emf -> Just ".emf" + Just Tiff -> Just ".tiff" Nothing -> Nothing let newGlobalId = fromMaybe (maxGlobalId + 1) (M.lookup fp globalIds) @@ -652,10 +669,10 @@ createCaption contentShapeDimensions paraElements = do ] , mknode "p:spPr" [] [ mknode "a:xfrm" [] - [ mknode "a:off" [("x", show $ 12700 * x), - ("y", show $ 12700 * (y + cy - captionHeight))] () - , mknode "a:ext" [("cx", show $ 12700 * cx), - ("cy", show $ 12700 * captionHeight)] () + [ mknode "a:off" [("x", tshow $ 12700 * x), + ("y", tshow $ 12700 * (y + cy - captionHeight))] () + , mknode "a:ext" [("cx", tshow $ 12700 * cx), + ("cy", tshow $ 12700 * captionHeight)] () ] , mknode "a:prstGeom" [("prst", "rect")] [ mknode "a:avLst" [] () @@ -704,11 +721,13 @@ makePicElements layout picProps mInfo alt = do ,("noChangeAspect","1")] () -- cNvPr will contain the link information so we do that separately, -- and register the link if necessary. 
- let cNvPrAttr = [("descr", mInfoFilePath mInfo), ("id","0"),("name","Picture 1")] + let cNvPrAttr = [("descr", T.pack $ mInfoFilePath mInfo), + ("id","0"), + ("name","Picture 1")] cNvPr <- case picPropLink picProps of Just link -> do idNum <- registerLink link return $ mknode "p:cNvPr" cNvPrAttr $ - mknode "a:hlinkClick" [("r:id", "rId" <> show idNum)] () + mknode "a:hlinkClick" [("r:id", "rId" <> tshow idNum)] () Nothing -> return $ mknode "p:cNvPr" cNvPrAttr () let nvPicPr = mknode "p:nvPicPr" [] [ cNvPr @@ -716,13 +735,13 @@ makePicElements layout picProps mInfo alt = do , mknode "p:nvPr" [] ()] let blipFill = mknode "p:blipFill" [] [ mknode "a:blip" [("r:embed", "rId" <> - show (mInfoLocalId mInfo))] () + tshow (mInfoLocalId mInfo))] () , mknode "a:stretch" [] $ mknode "a:fillRect" [] () ] let xfrm = mknode "a:xfrm" [] - [ mknode "a:off" [("x",show xoff'), ("y",show yoff')] () - , mknode "a:ext" [("cx",show dimX') - ,("cy",show dimY')] () ] + [ mknode "a:off" [("x", tshow xoff'), ("y", tshow yoff')] () + , mknode "a:ext" [("cx", tshow dimX') + ,("cy", tshow dimY')] () ] let prstGeom = mknode "a:prstGeom" [("prst","rect")] $ mknode "a:avLst" [] () let ln = mknode "a:ln" [("w","9525")] @@ -744,8 +763,8 @@ makePicElements layout picProps mInfo alt = do else return [picShape] -paraElemToElements :: PandocMonad m => ParaElem -> P m [Element] -paraElemToElements Break = return [mknode "a:br" [] ()] +paraElemToElements :: PandocMonad m => ParaElem -> P m [Content] +paraElemToElements Break = return [Elem $ mknode "a:br" [] ()] paraElemToElements (Run rpr s) = do sizeAttrs <- fontSizeAttributes rpr let attrs = sizeAttrs <> @@ -761,7 +780,7 @@ paraElemToElements (Run rpr s) = do Just DoubleStrike -> [("strike", "dblStrike")] Nothing -> []) <> (case rBaseline rpr of - Just n -> [("baseline", show n)] + Just n -> [("baseline", tshow n)] Nothing -> []) <> (case rCap rpr of Just NoCapitals -> [("cap", "none")] @@ -778,42 +797,44 @@ paraElemToElements (Run rpr s) = do return $ case link of InternalTarget _ -> let linkAttrs = - [ ("r:id", "rId" <> show idNum) + [ ("r:id", "rId" <> tshow idNum) , ("action", "ppaction://hlinksldjump") ] in [mknode "a:hlinkClick" linkAttrs ()] -- external ExternalTarget _ -> let linkAttrs = - [ ("r:id", "rId" <> show idNum) + [ ("r:id", "rId" <> tshow idNum) ] in [mknode "a:hlinkClick" linkAttrs ()] Nothing -> return [] let colorContents = case rSolidFill rpr of Just color -> case fromColor color of - '#':hx -> [mknode "a:solidFill" [] - [mknode "a:srgbClr" [("val", map toUpper hx)] ()] - ] + '#':hx -> + [mknode "a:solidFill" [] + [mknode "a:srgbClr" + [("val", T.toUpper $ T.pack hx)] ()]] _ -> [] Nothing -> [] codeFont <- monospaceFont let codeContents = - [mknode "a:latin" [("typeface", T.unpack codeFont)] () | rPropCode rpr] + [mknode "a:latin" [("typeface", codeFont)] () | rPropCode rpr] let propContents = linkProps <> colorContents <> codeContents - return [mknode "a:r" [] [ mknode "a:rPr" attrs propContents - , mknode "a:t" [] $ T.unpack s - ]] + return [Elem $ mknode "a:r" [] [ mknode "a:rPr" attrs propContents + , mknode "a:t" [] s + ]] paraElemToElements (MathElem mathType texStr) = do isInSpkrNotes <- asks envInSpeakerNotes if isInSpkrNotes then paraElemToElements $ Run def $ unTeXString texStr else do res <- convertMath writeOMML mathType (unTeXString texStr) - case res of - Right r -> return [mknode "a14:m" [] $ addMathInfo r] + case fromXLElement <$> res of + Right r -> return [Elem $ mknode "a14:m" [] $ addMathInfo r] Left (Str s) -> paraElemToElements 
(Run def s) Left _ -> throwError $ PandocShouldNeverHappenError "non-string math fallback" -paraElemToElements (RawOOXMLParaElem str) = return [ x | Elem x <- parseXML str ] +paraElemToElements (RawOOXMLParaElem str) = return + [Text (CData CDataRaw str Nothing)] -- This is a bit of a kludge -- really requires adding an option to @@ -821,9 +842,10 @@ paraElemToElements (RawOOXMLParaElem str) = return [ x | Elem x <- parseXML str -- step at a time. addMathInfo :: Element -> Element addMathInfo element = - let mathspace = Attr { attrKey = QName "m" Nothing (Just "xmlns") - , attrVal = "http://schemas.openxmlformats.org/officeDocument/2006/math" - } + let mathspace = + Attr { attrKey = QName "m" Nothing (Just "xmlns") + , attrVal = "http://schemas.openxmlformats.org/officeDocument/2006/math" + } in add_attr mathspace element -- We look through the element to see if it contains an a14:m @@ -846,13 +868,13 @@ surroundWithMathAlternate element = paragraphToElement :: PandocMonad m => Paragraph -> P m Element paragraphToElement par = do let - attrs = [("lvl", show $ pPropLevel $ paraProps par)] <> + attrs = [("lvl", tshow $ pPropLevel $ paraProps par)] <> (case pPropMarginLeft (paraProps par) of - Just px -> [("marL", show $ pixelsToEmu px)] + Just px -> [("marL", tshow $ pixelsToEmu px)] Nothing -> [] ) <> (case pPropIndent (paraProps par) of - Just px -> [("indent", show $ pixelsToEmu px)] + Just px -> [("indent", tshow $ pixelsToEmu px)] Nothing -> [] ) <> (case pPropAlign (paraProps par) of @@ -864,7 +886,7 @@ paragraphToElement par = do props = [] <> (case pPropSpaceBefore $ paraProps par of Just px -> [mknode "a:spcBef" [] [ - mknode "a:spcPts" [("val", show $ 100 * px)] () + mknode "a:spcPts" [("val", tshow $ 100 * px)] () ] ] Nothing -> [] @@ -875,8 +897,9 @@ paragraphToElement par = do [mknode "a:buAutoNum" (autoNumAttrs attrs') ()] Nothing -> [mknode "a:buNone" [] ()] ) - paras <- concat <$> mapM paraElemToElements (paraElems par) - return $ mknode "a:p" [] $ [mknode "a:pPr" attrs props] <> paras + paras <- mapM paraElemToElements (paraElems par) + return $ mknode "a:p" [] $ + [Elem $ mknode "a:pPr" attrs props] <> concat paras shapeToElement :: PandocMonad m => Element -> Shape -> P m Element shapeToElement layout (TextBox paras) @@ -896,21 +919,22 @@ shapeToElement layout (TextBox paras) -- GraphicFrame and Pic should never reach this. 
shapeToElement _ _ = return $ mknode "p:sp" [] () -shapeToElements :: PandocMonad m => Element -> Shape -> P m [Element] +shapeToElements :: PandocMonad m => Element -> Shape -> P m [Content] shapeToElements layout (Pic picProps fp alt) = do mInfo <- registerMedia fp alt case mInfoExt mInfo of - Just _ -> + Just _ -> map Elem <$> makePicElements layout picProps mInfo alt Nothing -> shapeToElements layout $ TextBox [Paragraph def alt] -shapeToElements layout (GraphicFrame tbls cptn) = +shapeToElements layout (GraphicFrame tbls cptn) = map Elem <$> graphicFrameToElements layout tbls cptn -shapeToElements _ (RawOOXMLShape str) = return [ x | Elem x <- parseXML str ] +shapeToElements _ (RawOOXMLShape str) = return + [Text (CData CDataRaw str Nothing)] shapeToElements layout shp = do element <- shapeToElement layout shp - return [element] + return [Elem element] -shapesToElements :: PandocMonad m => Element -> [Shape] -> P m [Element] +shapesToElements :: PandocMonad m => Element -> [Shape] -> P m [Content] shapesToElements layout shps = concat <$> mapM (shapeToElements layout) shps @@ -937,8 +961,10 @@ graphicFrameToElements layout tbls caption = do [mknode "p:ph" [("idx", "1")] ()] ] , mknode "p:xfrm" [] - [ mknode "a:off" [("x", show $ 12700 * x), ("y", show $ 12700 * y)] () - , mknode "a:ext" [("cx", show $ 12700 * cx), ("cy", show $ 12700 * cy)] () + [ mknode "a:off" [("x", tshow $ 12700 * x), + ("y", tshow $ 12700 * y)] () + , mknode "a:ext" [("cx", tshow $ 12700 * cx), + ("cy", tshow $ 12700 * cy)] () ] ] <> elements @@ -952,7 +978,7 @@ getDefaultTableStyle = do refArchive <- asks envRefArchive distArchive <- asks envDistArchive tblStyleLst <- parseXml refArchive distArchive "ppt/tableStyles.xml" - return $ findAttrText (QName "def" Nothing Nothing) tblStyleLst + return $ findAttr (QName "def" Nothing Nothing) tblStyleLst graphicToElement :: PandocMonad m => Integer -> Graphic -> P m Element graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do @@ -990,7 +1016,7 @@ graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do let mkrow border cells = mknode "a:tr" [("h", "0")] $ map (mkcell border) cells let mkgridcol w = mknode "a:gridCol" - [("w", show ((12700 * w) :: Integer))] () + [("w", tshow ((12700 * w) :: Integer))] () let hasHeader = not (all null hdrCells) mbDefTblStyle <- getDefaultTableStyle @@ -999,7 +1025,7 @@ graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do , ("bandRow", if tblPrBandRow tblPr then "1" else "0") ] (case mbDefTblStyle of Nothing -> [] - Just sty -> [mknode "a:tableStyleId" [] $ T.unpack sty]) + Just sty -> [mknode "a:tableStyleId" [] sty]) return $ mknode "a:graphic" [] [mknode "a:graphicData" [("uri", "http://schemas.openxmlformats.org/drawingml/2006/table")] @@ -1032,7 +1058,7 @@ findPHType ns spElem phType -- if it's a named PHType, we want to check that the attribute -- value matches. 
Just phElem | (PHType tp) <- phType -> - case findAttrText (QName "type" Nothing Nothing) phElem of + case findAttr (QName "type" Nothing Nothing) phElem of Just tp' -> tp == tp' Nothing -> False -- if it's an ObjType, we want to check that there is NO @@ -1083,7 +1109,7 @@ contentToElement layout hdrShape shapes , Just cSld <- findChild (elemName ns "p" "cSld") layout , Just spTree <- findChild (elemName ns "p" "spTree") cSld = do element <- nonBodyTextToElement layout [PHType "title"] hdrShape - let hdrShapeElements = [element | not (null hdrShape)] + let hdrShapeElements = [Elem element | not (null hdrShape)] contentElements <- local (\env -> env {envContentType = NormalContent}) (shapesToElements layout shapes) @@ -1096,7 +1122,7 @@ twoColumnToElement layout hdrShape shapesL shapesR , Just cSld <- findChild (elemName ns "p" "cSld") layout , Just spTree <- findChild (elemName ns "p" "spTree") cSld = do element <- nonBodyTextToElement layout [PHType "title"] hdrShape - let hdrShapeElements = [element | not (null hdrShape)] + let hdrShapeElements = [Elem element | not (null hdrShape)] contentElementsL <- local (\env -> env {envContentType =TwoColumnLeftContent}) (shapesToElements layout shapesL) @@ -1105,7 +1131,8 @@ twoColumnToElement layout hdrShape shapesL shapesR (shapesToElements layout shapesR) -- let contentElementsL' = map (setIdx ns "1") contentElementsL -- contentElementsR' = map (setIdx ns "2") contentElementsR - return $ buildSpTree ns spTree (hdrShapeElements <> contentElementsL <> contentElementsR) + return $ buildSpTree ns spTree $ + hdrShapeElements <> contentElementsL <> contentElementsR twoColumnToElement _ _ _ _= return $ mknode "p:sp" [] () @@ -1115,7 +1142,7 @@ titleToElement layout titleElems , Just cSld <- findChild (elemName ns "p" "cSld") layout , Just spTree <- findChild (elemName ns "p" "spTree") cSld = do element <- nonBodyTextToElement layout [PHType "title", PHType "ctrTitle"] titleElems - let titleShapeElements = [element | not (null titleElems)] + let titleShapeElements = [Elem element | not (null titleElems)] return $ buildSpTree ns spTree titleShapeElements titleToElement _ _ = return $ mknode "p:sp" [] () @@ -1135,7 +1162,8 @@ metadataToElement layout titleElems subtitleElems authorsElems dateElems dateShapeElements <- if null dateElems then return [] else sequence [nonBodyTextToElement layout [PHType "dt"] dateElems] - return $ buildSpTree ns spTree (titleShapeElements <> subtitleShapeElements <> dateShapeElements) + return . buildSpTree ns spTree . 
map Elem $ + (titleShapeElements <> subtitleShapeElements <> dateShapeElements) metadataToElement _ _ _ _ _ = return $ mknode "p:sp" [] () slideToElement :: PandocMonad m => Slide -> P m Element @@ -1197,7 +1225,7 @@ getSlideNumberFieldId notesMaster , Just txBody <- findChild (elemName ns "p" "txBody") sp , Just p <- findChild (elemName ns "a" "p") txBody , Just fld <- findChild (elemName ns "a" "fld") p - , Just fldId <- findAttrText (QName "id" Nothing Nothing) fld = + , Just fldId <- findAttr (QName "id" Nothing Nothing) fld = return fldId | otherwise = throwError $ PandocSomeError @@ -1276,11 +1304,11 @@ speakerNotesSlideNumber pgNum fieldId = [ mknode "a:bodyPr" [] () , mknode "a:lstStyle" [] () , mknode "a:p" [] - [ mknode "a:fld" [ ("id", T.unpack fieldId) + [ mknode "a:fld" [ ("id", fieldId) , ("type", "slidenum") ] [ mknode "a:rPr" [("lang", "en-US")] () - , mknode "a:t" [] (show pgNum) + , mknode "a:t" [] (tshow pgNum) ] , mknode "a:endParaRPr" [("lang", "en-US")] () ] @@ -1332,7 +1360,7 @@ getSlideIdNum sldId = do Just n -> return n Nothing -> throwError $ PandocShouldNeverHappenError $ - "Slide Id " <> T.pack (show sldId) <> " not found." + "Slide Id " <> tshow sldId <> " not found." slideNum :: PandocMonad m => Slide -> P m Int slideNum slide = getSlideIdNum $ slideId slide @@ -1349,7 +1377,7 @@ slideToRelId :: PandocMonad m => Slide -> P m T.Text slideToRelId slide = do n <- slideNum slide offset <- asks envSlideIdOffset - return $ "rId" <> T.pack (show $ n + offset) + return $ "rId" <> tshow (n + offset) data Relationship = Relationship { relId :: Int @@ -1361,13 +1389,11 @@ elementToRel :: Element -> Maybe Relationship elementToRel element | elName element == QName "Relationship" (Just "http://schemas.openxmlformats.org/package/2006/relationships") Nothing = do rId <- findAttr (QName "Id" Nothing Nothing) element - numStr <- stripPrefix "rId" rId - num <- case reads numStr :: [(Int, String)] of - (n, _) : _ -> Just n - [] -> Nothing - type' <- findAttrText (QName "Type" Nothing Nothing) element + numStr <- T.stripPrefix "rId" rId + num <- fromIntegral <$> readTextAsInteger numStr + type' <- findAttr (QName "Type" Nothing Nothing) element target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship num type' target + return $ Relationship num type' (T.unpack target) | otherwise = Nothing slideToPresRel :: PandocMonad m => Slide -> P m Relationship @@ -1416,11 +1442,8 @@ presentationToRels pres@(Presentation _ slides) = do -- all relWithoutSlide rels (unless they're 1) -- 3. If we have a notesmaster slide, we make space for that as well. 
- let minRelNotOne = case filter (1<) $ map relId relsWeKeep of - [] -> 0 -- doesn't matter in this case, since - -- there will be nothing to map the - -- function over - l -> minimum l + let minRelNotOne = maybe 0 minimum $ nonEmpty + $ filter (1 <) $ map relId relsWeKeep modifyRelNum :: Int -> Int modifyRelNum 1 = 1 @@ -1456,10 +1479,9 @@ topLevelRelsEntry :: PandocMonad m => P m Entry topLevelRelsEntry = elemToEntry "_rels/.rels" $ relsToElement topLevelRels relToElement :: Relationship -> Element -relToElement rel = mknode "Relationship" [ ("Id", "rId" <> - show (relId rel)) - , ("Type", T.unpack $ relType rel) - , ("Target", relTarget rel) ] () +relToElement rel = mknode "Relationship" [ ("Id", "rId" <> tshow (relId rel)) + , ("Type", relType rel) + , ("Target", T.pack $ relTarget rel) ] () relsToElement :: [Relationship] -> Element relsToElement rels = mknode "Relationships" @@ -1494,7 +1516,8 @@ slideToSpeakerNotesEntry slide = do Just element | Just notesIdNum <- mbNotesIdNum -> Just <$> elemToEntry - ("ppt/notesSlides/notesSlide" <> show notesIdNum <> ".xml") + ("ppt/notesSlides/notesSlide" <> show notesIdNum <> + ".xml") element _ -> return Nothing @@ -1507,7 +1530,7 @@ slideToSpeakerNotesRelElement slide@Slide{} = do [("xmlns", "http://schemas.openxmlformats.org/package/2006/relationships")] [ mknode "Relationship" [ ("Id", "rId2") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide") - , ("Target", "../slides/slide" <> show idNum <> ".xml") + , ("Target", "../slides/slide" <> tshow idNum <> ".xml") ] () , mknode "Relationship" [ ("Id", "rId1") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesMaster") @@ -1540,15 +1563,15 @@ linkRelElement :: PandocMonad m => (Int, LinkTarget) -> P m Element linkRelElement (rIdNum, InternalTarget targetId) = do targetIdNum <- getSlideIdNum targetId return $ - mknode "Relationship" [ ("Id", "rId" <> show rIdNum) + mknode "Relationship" [ ("Id", "rId" <> tshow rIdNum) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide") - , ("Target", "slide" <> show targetIdNum <> ".xml") + , ("Target", "slide" <> tshow targetIdNum <> ".xml") ] () linkRelElement (rIdNum, ExternalTarget (url, _)) = return $ - mknode "Relationship" [ ("Id", "rId" <> show rIdNum) + mknode "Relationship" [ ("Id", "rId" <> tshow rIdNum) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink") - , ("Target", T.unpack url) + , ("Target", url) , ("TargetMode", "External") ] () @@ -1560,10 +1583,10 @@ mediaRelElement mInfo = let ext = fromMaybe "" (mInfoExt mInfo) in mknode "Relationship" [ ("Id", "rId" <> - show (mInfoLocalId mInfo)) + tshow (mInfoLocalId mInfo)) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image") , ("Target", "../media/image" <> - show (mInfoGlobalId mInfo) <> T.unpack ext) + tshow (mInfoGlobalId mInfo) <> ext) ] () speakerNotesSlideRelElement :: PandocMonad m => Slide -> P m (Maybe Element) @@ -1573,7 +1596,7 @@ speakerNotesSlideRelElement slide = do return $ case M.lookup idNum mp of Nothing -> Nothing Just n -> - let target = "../notesSlides/notesSlide" <> show n <> ".xml" + let target = "../notesSlides/notesSlide" <> tshow n <> ".xml" in Just $ mknode "Relationship" [ ("Id", "rId2") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide") @@ -1612,9 +1635,9 @@ slideToSlideRelElement slide = do slideToSldIdElement :: PandocMonad m => Slide -> P m Element 
slideToSldIdElement slide = do n <- slideNum slide - let id' = show $ n + 255 + let id' = tshow $ n + 255 rId <- slideToRelId slide - return $ mknode "p:sldId" [("id", id'), ("r:id", T.unpack rId)] () + return $ mknode "p:sldId" [("id", id'), ("r:id", rId)] () presentationToSldIdLst :: PandocMonad m => Presentation -> P m Element presentationToSldIdLst (Presentation _ slides) = do @@ -1639,7 +1662,7 @@ presentationToPresentationElement pres@(Presentation _ slds) = do notesMasterElem = mknode "p:notesMasterIdLst" [] [ mknode "p:NotesMasterId" - [("r:id", "rId" <> show notesMasterRId)] + [("r:id", "rId" <> tshow notesMasterRId)] () ] @@ -1695,17 +1718,17 @@ docPropsElement docProps = do ,("xmlns:dcmitype","http://purl.org/dc/dcmitype/") ,("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance")] $ - mknode "dc:title" [] (maybe "" T.unpack $ dcTitle docProps) + mknode "dc:title" [] (fromMaybe "" $ dcTitle docProps) : - mknode "dc:creator" [] (maybe "" T.unpack $ dcCreator docProps) + mknode "dc:creator" [] (fromMaybe "" $ dcCreator docProps) : - mknode "cp:keywords" [] (T.unpack keywords) - : ( [mknode "dc:subject" [] $ maybe "" T.unpack $ dcSubject docProps | isJust (dcSubject docProps)]) - <> ( [mknode "dc:description" [] $ maybe "" T.unpack $ dcDescription docProps | isJust (dcDescription docProps)]) - <> ( [mknode "cp:category" [] $ maybe "" T.unpack $ cpCategory docProps | isJust (cpCategory docProps)]) + mknode "cp:keywords" [] keywords + : ( [mknode "dc:subject" [] $ fromMaybe "" $ dcSubject docProps | isJust (dcSubject docProps)]) + <> ( [mknode "dc:description" [] $ fromMaybe "" $ dcDescription docProps | isJust (dcDescription docProps)]) + <> ( [mknode "cp:category" [] $ fromMaybe "" $ cpCategory docProps | isJust (cpCategory docProps)]) <> (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] x - , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x - ]) (formatTime defaultTimeLocale "%FT%XZ" utctime) + , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x + ]) (T.pack $ formatTime defaultTimeLocale "%FT%XZ" utctime) docPropsToEntry :: PandocMonad m => DocProps -> P m Entry docPropsToEntry docProps = docPropsElement docProps >>= @@ -1716,8 +1739,8 @@ docCustomPropsElement :: PandocMonad m => DocProps -> P m Element docCustomPropsElement docProps = do let mkCustomProp (k, v) pid = mknode "property" [("fmtid","{D5CDD505-2E9C-101B-9397-08002B2CF9AE}") - ,("pid", show pid) - ,("name", T.unpack k)] $ mknode "vt:lpwstr" [] (T.unpack v) + ,("pid", tshow pid) + ,("name", k)] $ mknode "vt:lpwstr" [] v return $ mknode "Properties" [("xmlns","http://schemas.openxmlformats.org/officeDocument/2006/custom-properties") ,("xmlns:vt","http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes") @@ -1736,7 +1759,7 @@ viewPropsElement = do distArchive <- asks envDistArchive viewPrElement <- parseXml refArchive distArchive "ppt/viewProps.xml" -- remove "lastView" if it exists: - let notLastView :: Text.XML.Light.Attr -> Bool + let notLastView :: XML.Attr -> Bool notLastView attr = qName (attrKey attr) /= "lastView" return $ @@ -1748,15 +1771,15 @@ makeViewPropsEntry = viewPropsElement >>= elemToEntry "ppt/viewProps.xml" defaultContentTypeToElem :: DefaultContentType -> Element defaultContentTypeToElem dct = mknode "Default" - [("Extension", T.unpack $ defContentTypesExt dct), - ("ContentType", T.unpack $ defContentTypesType dct)] + [("Extension", defContentTypesExt dct), + ("ContentType", defContentTypesType dct)] () overrideContentTypeToElem :: 
OverrideContentType -> Element overrideContentTypeToElem oct = mknode "Override" - [("PartName", overrideContentTypesPart oct), - ("ContentType", T.unpack $ overrideContentTypesType oct)] + [("PartName", T.pack $ overrideContentTypesPart oct), + ("ContentType", overrideContentTypesType oct)] () contentTypesToElement :: ContentTypes -> Element @@ -1814,7 +1837,8 @@ getSpeakerNotesFilePaths :: PandocMonad m => P m [FilePath] getSpeakerNotesFilePaths = do mp <- asks envSpeakerNotesIdMap let notesIdNums = M.elems mp - return $ map (\n -> "ppt/notesSlides/notesSlide" <> show n <> ".xml") notesIdNums + return $ map (\n -> "ppt/notesSlides/notesSlide" <> show n <> ".xml") + notesIdNums presentationToContentTypes :: PandocMonad m => Presentation -> P m ContentTypes presentationToContentTypes p@(Presentation _ slides) = do @@ -1878,11 +1902,11 @@ getContentType fp | otherwise = Nothing -- Kept as String for XML.Light -autoNumAttrs :: ListAttributes -> [(String, String)] +autoNumAttrs :: ListAttributes -> [(Text, Text)] autoNumAttrs (startNum, numStyle, numDelim) = numAttr <> typeAttr where - numAttr = [("startAt", show startNum) | startNum /= 1] + numAttr = [("startAt", tshow startNum) | startNum /= 1] typeAttr = [("type", typeString <> delimString)] typeString = case numStyle of Decimal -> "arabic" diff --git a/src/Text/Pandoc/Writers/Powerpoint/Presentation.hs b/src/Text/Pandoc/Writers/Powerpoint/Presentation.hs index affec38aa..9246a93e9 100644 --- a/src/Text/Pandoc/Writers/Powerpoint/Presentation.hs +++ b/src/Text/Pandoc/Writers/Powerpoint/Presentation.hs @@ -45,6 +45,7 @@ module Text.Pandoc.Writers.Powerpoint.Presentation ( documentToPresentation import Control.Monad.Reader import Control.Monad.State import Data.List (intercalate) +import Data.List.NonEmpty (nonEmpty) import Data.Default import Text.Pandoc.Definition import Text.Pandoc.ImageSize @@ -363,9 +364,7 @@ inlineToParElems (Note blks) = do then return [] else do notes <- gets stNoteIds - let maxNoteId = case M.keys notes of - [] -> 0 - lst -> maximum lst + let maxNoteId = maybe 0 maximum $ nonEmpty $ M.keys notes curNoteId = maxNoteId + 1 modify $ \st -> st { stNoteIds = M.insert curNoteId blks notes } local (\env -> env{envRunProps = (envRunProps env){rLink = Just $ InternalTarget endNotesSlideId}}) $ diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index 43bf382b7..983ef412a 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.RST - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -16,7 +16,8 @@ reStructuredText: <http://docutils.sourceforge.net/rst.html> module Text.Pandoc.Writers.RST ( writeRST, flatten ) where import Control.Monad.State.Strict import Data.Char (isSpace) -import Data.List (transpose, intersperse) +import Data.List (transpose, intersperse, foldl') +import qualified Data.List.NonEmpty as NE import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text (Text) @@ -143,9 +144,12 @@ pictToRST (label, (attr, src, _, mbtarget)) = do let (_, cls, _) = attr classes = case cls of [] -> empty - ["align-right"] -> ":align: right" - ["align-left"] -> ":align: left" - ["align-center"] -> ":align: center" + ["align-top"] -> ":align: top" + ["align-middle"] -> ":align: middle" + ["align-bottom"] -> ":align: bottom" + ["align-center"] -> 
empty + ["align-right"] -> empty + ["align-left"] -> empty _ -> ":class: " <> literal (T.unwords cls) return $ nowrap $ ".. |" <> label' <> "| image:: " <> literal src $$ hang 3 empty (classes $$ dims) @@ -215,19 +219,28 @@ blockToRST (Div (ident,classes,_kvs) bs) = do nest 3 contents $$ blankline blockToRST (Plain inlines) = inlineListToRST inlines --- title beginning with fig: indicates that the image is a figure -blockToRST (Para [Image attr txt (src,T.stripPrefix "fig:" -> Just tit)]) = do - capt <- inlineListToRST txt +blockToRST (Para [Image attr txt (src, rawtit)]) = do + description <- inlineListToRST txt dims <- imageDimsToRST attr - let fig = "figure:: " <> literal src - alt = ":alt: " <> if T.null tit then capt else literal tit + -- title beginning with fig: indicates that the image is a figure + let (isfig, tit) = case T.stripPrefix "fig:" rawtit of + Nothing -> (False, rawtit) + Just tit' -> (True, tit') + let fig | isfig = "figure:: " <> literal src + | otherwise = "image:: " <> literal src + alt | isfig = ":alt: " <> if T.null tit then description else literal tit + | null txt = empty + | otherwise = ":alt: " <> description + capt | isfig = description + | otherwise = empty (_,cls,_) = attr classes = case cls of [] -> empty ["align-right"] -> ":align: right" ["align-left"] -> ":align: left" ["align-center"] -> ":align: center" - _ -> ":figclass: " <> literal (T.unwords cls) + _ | isfig -> ":figclass: " <> literal (T.unwords cls) + | otherwise -> ":class: " <> literal (T.unwords cls) return $ hang 3 ".. " (fig $$ alt $$ classes $$ dims $+$ capt) $$ blankline blockToRST (Para inlines) | LineBreak `elem` inlines = @@ -323,7 +336,7 @@ blockToRST (OrderedList (start, style', delim) items) = do then replicate (length items) "#." else take (length items) $ orderedListMarkers (start, style', delim) - let maxMarkerLength = maximum $ map T.length markers + let maxMarkerLength = maybe 0 maximum $ NE.nonEmpty $ map T.length markers let markers' = map (\m -> let s = maxMarkerLength - T.length m in m <> T.replicate s " ") markers contents <- zipWithM orderedListItemToRST markers' items @@ -497,7 +510,7 @@ flatten outer | null contents = [outer] | otherwise = combineAll contents where contents = dropInlineParent outer - combineAll = foldl combine [] + combineAll = foldl' combine [] combine :: [Inline] -> Inline -> [Inline] combine f i = @@ -507,8 +520,8 @@ flatten outer (Quoted _ _, _) -> keep f i (_, Quoted _ _) -> keep f i -- spans are not rendered using RST inlines, so we can keep them - (Span ("",[],[]) _, _) -> keep f i - (_, Span ("",[],[]) _) -> keep f i + (Span (_,_,[]) _, _) -> keep f i + (_, Span (_,_,[]) _) -> keep f i -- inlineToRST handles this case properly so it's safe to keep ( Link{}, Image{}) -> keep f i -- parent inlines would prevent links from being correctly @@ -525,11 +538,15 @@ flatten outer collapse f i = appendToLast f $ dropInlineParent i appendToLast :: [Inline] -> [Inline] -> [Inline] - appendToLast [] toAppend = [setInlineChildren outer toAppend] - appendToLast flattened toAppend - | isOuter lastFlat = init flattened <> [appendTo lastFlat toAppend] - | otherwise = flattened <> [setInlineChildren outer toAppend] - where lastFlat = last flattened + appendToLast flattened toAppend = + case NE.nonEmpty flattened of + Nothing -> [setInlineChildren outer toAppend] + Just xs -> + if isOuter lastFlat + then NE.init xs <> [appendTo lastFlat toAppend] + else flattened <> [setInlineChildren outer toAppend] + where + lastFlat = NE.last xs appendTo o i = mapNested (<> i) o 
isOuter i = emptyParent i == emptyParent outer emptyParent i = setInlineChildren i [] @@ -749,8 +766,7 @@ simpleTable opts blocksToDoc headers rows = do then return [] else fixEmpties <$> mapM (blocksToDoc opts) headers rowDocs <- mapM (fmap fixEmpties . mapM (blocksToDoc opts)) rows - let numChars [] = 0 - numChars xs = maximum . map offset $ xs + let numChars = maybe 0 maximum . NE.nonEmpty . map offset let colWidths = map numChars $ transpose (headerDocs : rowDocs) let toRow = mconcat . intersperse (lblock 1 " ") . zipWith lblock colWidths let hline = nowrap $ hsep (map (\n -> literal (T.replicate n "=")) colWidths) diff --git a/src/Text/Pandoc/Writers/RTF.hs b/src/Text/Pandoc/Writers/RTF.hs index e3966ed07..3527949b4 100644 --- a/src/Text/Pandoc/Writers/RTF.hs +++ b/src/Text/Pandoc/Writers/RTF.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.RTF - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -16,7 +16,7 @@ module Text.Pandoc.Writers.RTF ( writeRTF import Control.Monad.Except (catchError, throwError) import Control.Monad import qualified Data.ByteString as B -import Data.Char (chr, isDigit, ord) +import Data.Char (chr, isDigit, ord, isAlphaNum) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T @@ -137,15 +137,21 @@ handleUnicode = T.concatMap $ \c -> -- | Escape special characters. escapeSpecial :: Text -> Text -escapeSpecial = escapeStringUsing $ - [ ('\t',"\\tab ") - , ('\8216',"\\u8216'") - , ('\8217',"\\u8217'") - , ('\8220',"\\u8220\"") - , ('\8221',"\\u8221\"") - , ('\8211',"\\u8211-") - , ('\8212',"\\u8212-") - ] <> backslashEscapes "{\\}" +escapeSpecial t + | T.all isAlphaNum t = t + | otherwise = T.concatMap escChar t + where + escChar '\t' = "\\tab " + escChar '\8216' = "\\u8216'" + escChar '\8217' = "\\u8217'" + escChar '\8220' = "\\u8220\"" + escChar '\8221' = "\\u8221\"" + escChar '\8211' = "\\u8211-" + escChar '\8212' = "\\u8212-" + escChar '{' = "\\{" + escChar '}' = "\\}" + escChar '\\' = "\\\\" + escChar c = T.singleton c -- | Escape strings as needed for rich text format. 
stringToRTF :: Text -> Text diff --git a/src/Text/Pandoc/Writers/Roff.hs b/src/Text/Pandoc/Writers/Roff.hs index 00b027cc9..6af56242f 100644 --- a/src/Text/Pandoc/Writers/Roff.hs +++ b/src/Text/Pandoc/Writers/Roff.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Roff - Copyright : Copyright (C) 2007-2020 John MacFarlane + Copyright : Copyright (C) 2007-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/Shared.hs b/src/Text/Pandoc/Writers/Shared.hs index b399afbf3..0b7c6bee0 100644 --- a/src/Text/Pandoc/Writers/Shared.hs +++ b/src/Text/Pandoc/Writers/Shared.hs @@ -3,7 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Shared - Copyright : Copyright (C) 2013-2020 John MacFarlane + Copyright : Copyright (C) 2013-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -20,6 +20,7 @@ module Text.Pandoc.Writers.Shared ( , setField , resetField , defField + , getLang , tagWithAttrs , isDisplayMath , fixDisplayMath @@ -44,6 +45,7 @@ import Control.Monad (zipWithM) import Data.Aeson (ToJSON (..), encode) import Data.Char (chr, ord, isSpace) import Data.List (groupBy, intersperse, transpose, foldl') +import Data.List.NonEmpty (NonEmpty(..), nonEmpty) import Data.Text.Conversions (FromText(..)) import qualified Data.Map as M import qualified Data.Text as T @@ -109,8 +111,7 @@ metaValueToVal blockWriter inlineWriter (MetaMap metamap) = MapVal . Context <$> mapM (metaValueToVal blockWriter inlineWriter) metamap metaValueToVal blockWriter inlineWriter (MetaList xs) = ListVal <$> mapM (metaValueToVal blockWriter inlineWriter) xs -metaValueToVal _ _ (MetaBool True) = return $ SimpleVal "true" -metaValueToVal _ _ (MetaBool False) = return NullVal +metaValueToVal _ _ (MetaBool b) = return $ BoolVal b metaValueToVal _ inlineWriter (MetaString s) = SimpleVal <$> inlineWriter (Builder.toList (Builder.text s)) metaValueToVal blockWriter _ (MetaBlocks bs) = SimpleVal <$> blockWriter bs @@ -147,6 +148,19 @@ defField field val (Context m) = where f _newval oldval = oldval +-- | Get the contents of the `lang` metadata field or variable. +getLang :: WriterOptions -> Meta -> Maybe T.Text +getLang opts meta = + case lookupContext "lang" (writerVariables opts) of + Just s -> Just s + _ -> + case lookupMeta "lang" meta of + Just (MetaBlocks [Para [Str s]]) -> Just s + Just (MetaBlocks [Plain [Str s]]) -> Just s + Just (MetaInlines [Str s]) -> Just s + Just (MetaString s) -> Just s + _ -> Nothing + -- | Produce an HTML tag with the given pandoc attributes. tagWithAttrs :: HasChars a => T.Text -> Attr -> Doc a tagWithAttrs tag (ident,classes,kvs) = hsep @@ -225,7 +239,7 @@ gridTable :: (Monad m, HasChars a) -> m (Doc a) gridTable opts blocksToDoc headless aligns widths headers rows = do -- the number of columns will be used in case of even widths - let numcols = maximum (length aligns : length widths : + let numcols = maximum (length aligns :| length widths : map length (headers:rows)) let officialWidthsInChars widths' = map ( (\x -> if x < 1 then 1 else x) . @@ -254,8 +268,7 @@ gridTable opts blocksToDoc headless aligns widths headers rows = do let handleFullWidths widths' = do rawHeaders' <- mapM (blocksToDoc opts) headers rawRows' <- mapM (mapM (blocksToDoc opts)) rows - let numChars [] = 0 - numChars xs = maximum . map offset $ xs + let numChars = maybe 0 maximum . nonEmpty . 
map offset let minWidthsInChars = map numChars $ transpose (rawHeaders' : rawRows') let widthsInChars' = zipWith max @@ -381,6 +394,7 @@ toSuperscript '2' = Just '\x00B2' toSuperscript '3' = Just '\x00B3' toSuperscript '+' = Just '\x207A' toSuperscript '-' = Just '\x207B' +toSuperscript '\x2212' = Just '\x207B' -- unicode minus toSuperscript '=' = Just '\x207C' toSuperscript '(' = Just '\x207D' toSuperscript ')' = Just '\x207E' diff --git a/src/Text/Pandoc/Writers/TEI.hs b/src/Text/Pandoc/Writers/TEI.hs index a9ee5eece..18015259d 100644 --- a/src/Text/Pandoc/Writers/TEI.hs +++ b/src/Text/Pandoc/Writers/TEI.hs @@ -2,7 +2,7 @@ {-# LANGUAGE PatternGuards #-} {- | Module : Text.Pandoc.Writers.Docbook - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -146,16 +146,17 @@ blockToTEI opts (LineBlock lns) = blockToTEI opts $ linesToPara lns blockToTEI opts (BlockQuote blocks) = inTagsIndented "quote" <$> blocksToTEI opts blocks -blockToTEI _ (CodeBlock (_,classes,_) str) = +blockToTEI opts (CodeBlock (_,classes,_) str) = return $ literal ("<ab type='codeblock " <> lang <> "'>") <> cr <> flush (literal (escapeStringForXML str) <> cr <> text "</ab>") where lang = if null langs then "" else escapeStringForXML (head langs) - isLang l = T.toLower l `elem` map T.toLower languages + syntaxMap = writerSyntaxMap opts + isLang l = T.toLower l `elem` map T.toLower (languages syntaxMap) langsFrom s = if isLang s then [s] - else languagesByExtension . T.toLower $ s + else (languagesByExtension syntaxMap) . T.toLower $ s langs = concatMap langsFrom classes blockToTEI opts (BulletList lst) = do let attribs = [("type", "unordered")] diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index c6debd9ce..6a33b4283 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Writers.Texinfo - Copyright : Copyright (C) 2008-2020 John MacFarlane + Copyright : Copyright (C) 2008-2021 John MacFarlane 2012 Peter Wang License : GNU GPL, version 2 or above @@ -14,8 +14,9 @@ Conversion of 'Pandoc' format into Texinfo. module Text.Pandoc.Writers.Texinfo ( writeTexinfo ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict -import Data.Char (chr, ord) -import Data.List (maximumBy, transpose) +import Data.Char (chr, ord, isAlphaNum) +import Data.List (maximumBy, transpose, foldl') +import Data.List.NonEmpty (nonEmpty) import Data.Ord (comparing) import qualified Data.Set as Set import Data.Text (Text) @@ -84,16 +85,18 @@ pandocToTexinfo options (Pandoc meta blocks) = do -- | Escape things as needed for Texinfo. 
stringToTexinfo :: Text -> Text -stringToTexinfo = escapeStringUsing texinfoEscapes - where texinfoEscapes = [ ('{', "@{") - , ('}', "@}") - , ('@', "@@") - , ('\160', "@ ") - , ('\x2014', "---") - , ('\x2013', "--") - , ('\x2026', "@dots{}") - , ('\x2019', "'") - ] +stringToTexinfo t + | T.all isAlphaNum t = t + | otherwise = T.concatMap escChar t + where escChar '{' = "@{" + escChar '}' = "@}" + escChar '@' = "@@" + escChar '\160' = "@ " + escChar '\x2014' = "---" + escChar '\x2013' = "--" + escChar '\x2026' = "@dots{}" + escChar '\x2019' = "'" + escChar c = T.singleton c escapeCommas :: PandocMonad m => TI m (Doc Text) -> TI m (Doc Text) escapeCommas parser = do @@ -238,9 +241,13 @@ blockToTexinfo (Table _ blkCapt specs thead tbody tfoot) = do colDescriptors <- if all (== 0) widths then do -- use longest entry instead of column widths - cols <- mapM (mapM (liftM (T.unpack . render Nothing . hcat) . mapM blockToTexinfo)) $ + cols <- mapM (mapM (fmap (T.unpack . render Nothing . hcat) . + mapM blockToTexinfo)) $ transpose $ heads : rows - return $ concatMap ((\x -> "{"++x++"} ") . maximumBy (comparing length)) cols + return $ concatMap + ((\x -> "{"++x++"} ") . + maybe "" (maximumBy (comparing length)) . nonEmpty) + cols else return $ "@columnfractions " ++ concatMap (printf "%.2f ") widths let tableBody = text ("@multitable " ++ colDescriptors) $$ headers $$ @@ -271,7 +278,7 @@ tableAnyRowToTexinfo :: PandocMonad m -> [[Block]] -> TI m (Doc Text) tableAnyRowToTexinfo itemtype aligns cols = - (literal itemtype $$) . foldl (\row item -> row $$ + (literal itemtype $$) . foldl' (\row item -> row $$ (if isEmpty row then empty else text " @tab ") <> item) empty <$> zipWithM alignedBlock aligns cols alignedBlock :: PandocMonad m diff --git a/src/Text/Pandoc/Writers/Textile.hs b/src/Text/Pandoc/Writers/Textile.hs index 61ddb7497..03d030477 100644 --- a/src/Text/Pandoc/Writers/Textile.hs +++ b/src/Text/Pandoc/Writers/Textile.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.Textile - Copyright : Copyright (C) 2010-2020 John MacFarlane + Copyright : Copyright (C) 2010-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Writers/ZimWiki.hs b/src/Text/Pandoc/Writers/ZimWiki.hs index 902b093d3..df914f590 100644 --- a/src/Text/Pandoc/Writers/ZimWiki.hs +++ b/src/Text/Pandoc/Writers/ZimWiki.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Writers.ZimWiki - Copyright : © 2008-2020 John MacFarlane, + Copyright : © 2008-2021 John MacFarlane, 2017-2019 Alex Ivkin License : GNU GPL, version 2 or above @@ -20,6 +20,7 @@ import Control.Monad (zipWithM) import Control.Monad.State.Strict (StateT, evalStateT, gets, modify) import Data.Default (Default (..)) import Data.List (transpose) +import Data.List.NonEmpty (nonEmpty) import qualified Data.Map as Map import Text.DocLayout (render, literal) import Data.Maybe (fromMaybe) @@ -115,7 +116,7 @@ blockToZimWiki opts b@(RawBlock f str) blockToZimWiki _ HorizontalRule = return "\n----\n" blockToZimWiki opts (Header level _ inlines) = do - contents <- inlineListToZimWiki opts $ removeFormatting inlines -- emphasis, links etc. 
not allowed in headers + contents <- inlineListToZimWiki opts inlines let eqs = T.replicate ( 7 - level ) "=" return $ eqs <> " " <> contents <> " " <> eqs <> "\n" @@ -143,7 +144,8 @@ blockToZimWiki opts (Table _ blkCapt specs thead tbody tfoot) = do then zipWithM (tableItemToZimWiki opts) aligns (head rows) else mapM (inlineListToZimWiki opts . removeFormatting)headers -- emphasis, links etc. are not allowed in table headers rows' <- mapM (zipWithM (tableItemToZimWiki opts) aligns) rows - let widths = map (maximum . map T.length) $ transpose (headers':rows') + let widths = map (maybe 0 maximum . nonEmpty . map T.length) $ + transpose (headers':rows') let padTo (width, al) s = case width - T.length s of x | x > 0 -> diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index 4b71d7b69..79b4768ec 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -1,9 +1,8 @@ {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.XML - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -14,6 +13,7 @@ Functions for escaping and formatting XML. -} module Text.Pandoc.XML ( escapeCharForXML, escapeStringForXML, + escapeNCName, inTags, selfClosingTag, inTagsSimple, @@ -25,7 +25,7 @@ module Text.Pandoc.XML ( escapeCharForXML, html5Attributes, rdfaAttributes ) where -import Data.Char (isAscii, isSpace, ord) +import Data.Char (isAscii, isSpace, ord, isLetter, isDigit) import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities) @@ -120,26 +120,48 @@ html5EntityMap = foldr go mempty htmlEntities where ent' = T.takeWhile (/=';') (T.pack ent) _ -> entmap +-- | Converts a string into an NCName, i.e., an XML name without colons. +-- Disallowed characters are escaped using @ux%x@, where @%x@ is the +-- hexadecimal unicode identifier of the escaped character. +escapeNCName :: Text -> Text +escapeNCName t = case T.uncons t of + Nothing -> T.empty + Just (c, cs) -> escapeStartChar c <> T.concatMap escapeNCNameChar cs + where + escapeStartChar :: Char -> Text + escapeStartChar c = if isLetter c || c == '_' + then T.singleton c + else escapeChar c --- Unescapes XML entities + escapeNCNameChar :: Char -> Text + escapeNCNameChar c = if isNCNameChar c + then T.singleton c + else escapeChar c + + isNCNameChar :: Char -> Bool + isNCNameChar c = isLetter c || c `elem` ("_-.·" :: String) || isDigit c + || '\x0300' <= c && c <= '\x036f' + || '\x203f' <= c && c <= '\x2040' + + escapeChar :: Char -> Text + escapeChar = T.pack . printf "U%04X" . ord + +-- | Unescapes XML entities fromEntities :: Text -> Text -fromEntities = T.pack . 
fromEntities' +fromEntities t + = let (x, y) = T.break (== '&') t + in if T.null y + then t + else x <> + let (ent, rest) = T.break (\c -> isSpace c || c == ';') y + rest' = case T.uncons rest of + Just (';',ys) -> ys + _ -> rest + ent' = T.drop 1 ent <> ";" + in case T.pack <$> lookupEntity (T.unpack ent') of + Just c -> c <> fromEntities rest' + Nothing -> ent <> fromEntities rest -fromEntities' :: Text -> String -fromEntities' (T.uncons -> Just ('&', xs)) = - case lookupEntity $ T.unpack ent' of - Just c -> c <> fromEntities' rest - Nothing -> "&" <> fromEntities' xs - where (ent, rest) = case T.break (\c -> isSpace c || c == ';') xs of - (zs,T.uncons -> Just (';',ys)) -> (zs,ys) - (zs, ys) -> (zs,ys) - ent' - | Just ys <- T.stripPrefix "#X" ent = "#x" <> ys -- workaround tagsoup bug - | Just ('#', _) <- T.uncons ent = ent - | otherwise = ent <> ";" -fromEntities' t = case T.uncons t of - Just (x, xs) -> x : fromEntities' xs - Nothing -> "" html5Attributes :: Set.Set Text html5Attributes = Set.fromList diff --git a/src/Text/Pandoc/XML/Light.hs b/src/Text/Pandoc/XML/Light.hs new file mode 100644 index 000000000..07113ea92 --- /dev/null +++ b/src/Text/Pandoc/XML/Light.hs @@ -0,0 +1,89 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XML.Light + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +xml-light, which we used in pandoc's the XML-based readers, has +some limitations: in particular, it produces nodes with String +instead of Text, and the parser falls over on processing instructions +(see #7091). + +This module exports much of the API of xml-light, but using Text instead +of String. In addition, the xml-light parsers are replaced by xml-conduit's +well-tested parser. (The xml-conduit types are mapped to types +isomorphic to xml-light's, to avoid the need for massive code modifications +elsewhere.) Bridge functions to map xml-light types to this module's +types are also provided (since libraries like texmath still use xml-light). + +Another advantage of the xml-conduit parser is that it gives us +detailed information on xml parse errors. + +In the future we may want to move to using xml-conduit or another +xml library in the code base, but this change gives us +better performance and accuracy without much change in the +code that used xml-light. +-} +module Text.Pandoc.XML.Light + ( module Text.Pandoc.XML.Light.Types + , module Text.Pandoc.XML.Light.Proc + , module Text.Pandoc.XML.Light.Output + -- * Replacement for xml-light's Text.XML.Input + , parseXMLElement + , parseXMLContents + ) where + +import qualified Control.Exception as E +import qualified Text.XML as Conduit +import Text.XML.Unresolved (InvalidEventStream(..)) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.Map as M +import Data.Maybe (mapMaybe) +import Text.Pandoc.XML.Light.Types +import Text.Pandoc.XML.Light.Proc +import Text.Pandoc.XML.Light.Output + +-- Drop in replacement for parseXMLDoc in xml-light. +parseXMLElement :: TL.Text -> Either T.Text Element +parseXMLElement t = + elementToElement . Conduit.documentRoot <$> + either (Left . T.pack . 
E.displayException) Right + (Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t) + +parseXMLContents :: TL.Text -> Either T.Text [Content] +parseXMLContents t = + case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of + Left e -> + case E.fromException e of + Just (ContentAfterRoot _) -> + elContent <$> parseXMLElement ("<wrapper>" <> t <> "</wrapper>") + _ -> Left . T.pack . E.displayException $ e + Right x -> Right [Elem . elementToElement . Conduit.documentRoot $ x] + +elementToElement :: Conduit.Element -> Element +elementToElement (Conduit.Element name attribMap nodes) = + Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing + where + attrs = map (\(n,v) -> Attr (nameToQname n) v) $ + M.toList attribMap + nameToQname (Conduit.Name localName mbns mbpref) = + case mbpref of + Nothing -> + case T.stripPrefix "xmlns:" localName of + Just rest -> QName rest mbns (Just "xmlns") + Nothing -> QName localName mbns mbpref + _ -> QName localName mbns mbpref + +nodeToContent :: Conduit.Node -> Maybe Content +nodeToContent (Conduit.NodeElement el) = + Just (Elem (elementToElement el)) +nodeToContent (Conduit.NodeContent t) = + Just (Text (CData CDataText t Nothing)) +nodeToContent _ = Nothing + diff --git a/src/Text/Pandoc/XML/Light/Output.hs b/src/Text/Pandoc/XML/Light/Output.hs new file mode 100644 index 000000000..8182ef2ec --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Output.hs @@ -0,0 +1,234 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XML.Light.Output + Copyright : Copyright (C) 2007 Galois, Inc., 2021 John MacFarlane + License : GNU GPL, version 2 or above + + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + + This code is based on code from xml-light, released under the BSD3 license. + We use a text Builder instead of ShowS. +-} +module Text.Pandoc.XML.Light.Output + ( -- * Replacement for xml-light's Text.XML.Output + ppTopElement + , ppElement + , ppContent + , ppcElement + , ppcContent + , showTopElement + , showElement + , showContent + , useShortEmptyTags + , defaultConfigPP + , ConfigPP(..) + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText) +import Text.Pandoc.XML.Light.Types + +-- +-- duplicates functinos from Text.XML.Output +-- + +-- | The XML 1.0 header +xmlHeader :: Text +xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + + +-------------------------------------------------------------------------------- +data ConfigPP = ConfigPP + { shortEmptyTag :: QName -> Bool + , prettify :: Bool + } + +-- | Default pretty orinting configuration. +-- * Always use abbreviate empty tags. +defaultConfigPP :: ConfigPP +defaultConfigPP = ConfigPP { shortEmptyTag = const True + , prettify = False + } + +-- | The predicate specifies for which empty tags we should use XML's +-- abbreviated notation <TAG />. This is useful if we are working with +-- some XML-ish standards (such as certain versions of HTML) where some +-- empty tags should always be displayed in the <TAG></TAG> form. +useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP +useShortEmptyTags p c = c { shortEmptyTag = p } + + +-- | Specify if we should use extra white-space to make document more readable. +-- WARNING: This adds additional white-space to text elements, +-- and so it may change the meaning of the document. 
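
A minimal usage sketch for the new parser, assuming the API exactly as declared in this diff (parseXMLElement, unqual, findChildren, strContent); the bookTitles name and the sample input are illustrative.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.Pandoc.XML.Light

-- Parse a document and collect the text of every immediate <title> child.
bookTitles :: TL.Text -> Either T.Text [T.Text]
bookTitles raw = do
  root <- parseXMLElement raw          -- Left carries xml-conduit's error text
  pure $ map strContent (findChildren (unqual "title") root)

-- bookTitles "<books><title>One</title><title>Two</title></books>"
--   == Right ["One","Two"]
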
+useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP +useExtraWhiteSpace p c = c { prettify = p } + +-- | A configuration that tries to make things pretty +-- (possibly at the cost of changing the semantics a bit +-- through adding white space.) +prettyConfigPP :: ConfigPP +prettyConfigPP = useExtraWhiteSpace True defaultConfigPP + + +-------------------------------------------------------------------------------- + + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppTopElement :: Element -> Text +ppTopElement = ppcTopElement prettyConfigPP + +-- | Pretty printing elements +ppElement :: Element -> Text +ppElement = ppcElement prettyConfigPP + +-- | Pretty printing content +ppContent :: Content -> Text +ppContent = ppcContent prettyConfigPP + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppcTopElement :: ConfigPP -> Element -> Text +ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e] + +-- | Pretty printing elements +ppcElement :: ConfigPP -> Element -> Text +ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty + +-- | Pretty printing content +ppcContent :: ConfigPP -> Content -> Text +ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty + +ppcCData :: ConfigPP -> CData -> Text +ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty + +type Indent = Builder + +-- | Pretty printing content using ShowT +ppContentS :: ConfigPP -> Indent -> Content -> Builder +ppContentS c i x = case x of + Elem e -> ppElementS c i e + Text t -> ppCDataS c i t + CRef r -> showCRefS r + +ppElementS :: ConfigPP -> Indent -> Element -> Builder +ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <> + (case elContent e of + [] | "?" `T.isPrefixOf` qName name -> fromText " ?>" + | shortEmptyTag c name -> fromText " />" + [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name + cs -> singleton '>' <> nl <> + mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <> + i <> tagEnd name + where (nl,sp) = if prettify c then ("\n"," ") else ("","") + ) + where name = elName e + +ppCDataS :: ConfigPP -> Indent -> CData -> Builder +ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c) + then showCDataS t + else foldr cons mempty (T.unpack (showCData t)) + where cons :: Char -> Builder -> Builder + cons '\n' ys = singleton '\n' <> i <> ys + cons y ys = singleton y <> ys + + + +-------------------------------------------------------------------------------- + +-- | Adds the <?xml?> header. +showTopElement :: Element -> Text +showTopElement c = xmlHeader <> showElement c + +showContent :: Content -> Text +showContent = ppcContent defaultConfigPP + +showElement :: Element -> Text +showElement = ppcElement defaultConfigPP + +showCData :: CData -> Text +showCData = ppcCData defaultConfigPP + +-- Note: crefs should not contain '&', ';', etc. +showCRefS :: Text -> Builder +showCRefS r = singleton '&' <> fromText r <> singleton ';' + +-- | Convert a text element to characters. 
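
A sketch of how the printers above behave, assuming the element-building helpers (unode, unqual, add_attr, Attr) from Text.Pandoc.XML.Light.Types defined later in this diff; the sample element and attribute values are made up.

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import Text.Pandoc.XML.Light

sample :: Element
sample = add_attr (Attr (unqual "id") "intro")
       $ unode "section" [unode "title" ("Hello" :: Text)]

compact, indented :: Text
compact  = showElement sample  -- defaultConfigPP: one line, short empty tags
indented = ppElement sample    -- prettyConfigPP: adds indentation, and possibly
                               -- whitespace inside text nodes, as warned above
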
+showCDataS :: CData -> Builder +showCDataS cd = + case cdVerbatim cd of + CDataText -> escStr (cdData cd) + CDataVerbatim -> fromText "<![CDATA[" <> escCData (cdData cd) <> + fromText "]]>" + CDataRaw -> fromText (cdData cd) + +-------------------------------------------------------------------------------- +escCData :: Text -> Builder +escCData t + | "]]>" `T.isPrefixOf` t = + fromText "]]]]><![CDATA[>" <> fromText (T.drop 3 t) +escCData t + = case T.uncons t of + Nothing -> mempty + Just (c,t') -> singleton c <> escCData t' + +escChar :: Char -> Builder +escChar c = case c of + '<' -> fromText "&lt;" + '>' -> fromText "&gt;" + '&' -> fromText "&amp;" + '"' -> fromText "&quot;" + -- we use &#39; instead of &apos; because IE apparently has difficulties + -- rendering &apos; in xhtml. + -- Reported by Rohan Drape <rohan.drape@gmail.com>. + '\'' -> fromText "&#39;" + _ -> singleton c + + {- original xml-light version: + -- NOTE: We escape '\r' explicitly because otherwise they get lost + -- when parsed back in because of then end-of-line normalization rules. + _ | isPrint c || c == '\n' -> singleton c + | otherwise -> showText "&#" . showsT oc . singleton ';' + where oc = ord c + -} + +escStr :: Text -> Builder +escStr cs = if T.any needsEscape cs + then mconcat (map escChar (T.unpack cs)) + else fromText cs + where + needsEscape '<' = True + needsEscape '>' = True + needsEscape '&' = True + needsEscape '"' = True + needsEscape '\'' = True + needsEscape _ = False + +tagEnd :: QName -> Builder +tagEnd qn = fromText "</" <> showQName qn <> singleton '>' + +tagStart :: QName -> [Attr] -> Builder +tagStart qn as = singleton '<' <> showQName qn <> as_str + where as_str = if null as + then mempty + else mconcat (map showAttr as) + +showAttr :: Attr -> Builder +showAttr (Attr qn v) = singleton ' ' <> showQName qn <> + singleton '=' <> + singleton '"' <> escStr v <> singleton '"' + +showQName :: QName -> Builder +showQName q = + case qPrefix q of + Nothing -> fromText (qName q) + Just p -> fromText p <> singleton ':' <> fromText (qName q) diff --git a/src/Text/Pandoc/XML/Light/Proc.hs b/src/Text/Pandoc/XML/Light/Proc.hs new file mode 100644 index 000000000..a1fb200ff --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Proc.hs @@ -0,0 +1,139 @@ +{-# LANGUAGE FlexibleInstances #-} +{- | + Module : Text.Pandoc.XML.Light.Proc + Copyright : Copyright (C) 2007 Galois, Inc., 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + + This code is taken from xml-light, released under the BSD3 license. +-} +module Text.Pandoc.XML.Light.Proc + ( + -- * Replacement for xml-light's Text.XML.Proc + strContent + , onlyElems + , elChildren + , onlyText + , findChildren + , filterChildren + , filterChildrenName + , findChild + , filterChild + , filterChildName + , findElement + , filterElement + , filterElementName + , findElements + , filterElements + , filterElementsName + , findAttr + , lookupAttr + , lookupAttrBy + , findAttrBy + ) where + +import Data.Text (Text) +import Data.Maybe (listToMaybe) +import Data.List(find) +import Text.Pandoc.XML.Light.Types + +-- +-- copied from xml-light Text.XML.Proc +-- + +-- | Get the text value of an XML element. This function +-- ignores non-text elements, and concatenates all text elements. +strContent :: Element -> Text +strContent = mconcat . map cdData . onlyText . elContent + +-- | Select only the elements from a list of XML content.
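
The escChar/escStr escaping above is only reachable through the exported printers; a small sketch of the behaviour they should produce, assuming the code as shown (the outputs in the comments are worked by hand, not recorded test results).

{-# LANGUAGE OverloadedStrings #-}
import Text.Pandoc.XML.Light

textNode :: Content
textNode = Text (CData CDataText "1 < 2 & \"ok\"" Nothing)

-- showContent textNode
--   == "1 &lt; 2 &amp; &quot;ok&quot;"
--
-- showElement (add_attr (Attr (unqual "alt") "a<b") (unode "img" ()))
--   == "<img alt=\"a&lt;b\" />"
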
+onlyElems :: [Content] -> [Element] +onlyElems xs = [ x | Elem x <- xs ] + +-- | Select only the elements from a parent. +elChildren :: Element -> [Element] +elChildren e = [ x | Elem x <- elContent e ] + +-- | Select only the text from a list of XML content. +onlyText :: [Content] -> [CData] +onlyText xs = [ x | Text x <- xs ] + +-- | Find all immediate children with the given name. +findChildren :: QName -> Element -> [Element] +findChildren q e = filterChildren ((q ==) . elName) e + +-- | Filter all immediate children wrt a given predicate. +filterChildren :: (Element -> Bool) -> Element -> [Element] +filterChildren p e = filter p (onlyElems (elContent e)) + + +-- | Filter all immediate children wrt a given predicate over their names. +filterChildrenName :: (QName -> Bool) -> Element -> [Element] +filterChildrenName p e = filter (p.elName) (onlyElems (elContent e)) + + +-- | Find an immediate child with the given name. +findChild :: QName -> Element -> Maybe Element +findChild q e = listToMaybe (findChildren q e) + +-- | Find an immediate child with the given name. +filterChild :: (Element -> Bool) -> Element -> Maybe Element +filterChild p e = listToMaybe (filterChildren p e) + +-- | Find an immediate child with name matching a predicate. +filterChildName :: (QName -> Bool) -> Element -> Maybe Element +filterChildName p e = listToMaybe (filterChildrenName p e) + +-- | Find the left-most occurrence of an element matching given name. +findElement :: QName -> Element -> Maybe Element +findElement q e = listToMaybe (findElements q e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElement :: (Element -> Bool) -> Element -> Maybe Element +filterElement p e = listToMaybe (filterElements p e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElementName :: (QName -> Bool) -> Element -> Maybe Element +filterElementName p e = listToMaybe (filterElementsName p e) + +-- | Find all non-nested occurances of an element. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +findElements :: QName -> Element -> [Element] +findElements qn e = filterElementsName (qn==) e + +-- | Find all non-nested occurrences of an element wrt. given predicate. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElements :: (Element -> Bool) -> Element -> [Element] +filterElements p e + | p e = [e] + | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e + +-- | Find all non-nested occurences of an element wrt a predicate over element names. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElementsName :: (QName -> Bool) -> Element -> [Element] +filterElementsName p e = filterElements (p.elName) e + +-- | Lookup the value of an attribute. +findAttr :: QName -> Element -> Maybe Text +findAttr x e = lookupAttr x (elAttribs e) + +-- | Lookup attribute name from list. +lookupAttr :: QName -> [Attr] -> Maybe Text +lookupAttr x = lookupAttrBy (x ==) + +-- | Lookup the first attribute whose name satisfies the given predicate. +lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text +lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as + +-- | Lookup the value of the first attribute whose name +-- satisfies the given predicate. 
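
A sketch combining the selectors above with parseXMLElement from earlier in this diff; the firstHref helper, the element names, and the example URL are illustrative.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.Pandoc.XML.Light

-- The href of the first <link> element anywhere in the tree, if any.
firstHref :: TL.Text -> Maybe T.Text
firstHref raw =
  case parseXMLElement raw of
    Left _     -> Nothing
    Right root -> findElement (unqual "link") root >>= findAttr (unqual "href")

-- firstHref "<feed><meta/><link href=\"https://example.org\"/></feed>"
--   == Just "https://example.org"
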
+findAttrBy :: (QName -> Bool) -> Element -> Maybe Text +findAttrBy p e = lookupAttrBy p (elAttribs e) + + diff --git a/src/Text/Pandoc/XML/Light/Types.hs b/src/Text/Pandoc/XML/Light/Types.hs new file mode 100644 index 000000000..ba602ac1f --- /dev/null +++ b/src/Text/Pandoc/XML/Light/Types.hs @@ -0,0 +1,193 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE DeriveDataTypeable #-} +{- | + Module : Text.Pandoc.XML.Light.Types + Copyright : Copyright (C) 2007 Galois, Inc., 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + + This code is taken from xml-light, released under the BSD3 license. + It has been modified to use Text instead of String, and the fromXL* + functions have been added. +-} +module Text.Pandoc.XML.Light.Types + ( -- * Basic types, duplicating those from xml-light but with Text + -- instead of String + Line + , Content(..) + , Element(..) + , Attr(..) + , CData(..) + , CDataKind(..) + , QName(..) + , Node(..) + , unode + , unqual + , add_attr + , add_attrs + -- * Conversion functions from xml-light types + , fromXLQName + , fromXLCData + , fromXLElement + , fromXLAttr + , fromXLContent + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import Data.Data (Data) +import Data.Typeable (Typeable) +import qualified Text.XML.Light as XL + +-- +-- type definitions lightly modified from xml-light +-- + +-- | A line is an Integer +type Line = Integer + +-- | XML content +data Content = Elem Element + | Text CData + | CRef Text + deriving (Show, Typeable, Data, Ord, Eq) + +-- | XML elements +data Element = Element { + elName :: QName, + elAttribs :: [Attr], + elContent :: [Content], + elLine :: Maybe Line + } deriving (Show, Typeable, Data, Ord, Eq) + +-- | XML attributes +data Attr = Attr { + attrKey :: QName, + attrVal :: Text + } deriving (Eq, Ord, Show, Typeable, Data) + +-- | XML CData +data CData = CData { + cdVerbatim :: CDataKind, + cdData :: Text, + cdLine :: Maybe Line + } deriving (Show, Typeable, Data, Ord, Eq) + +data CDataKind + = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc. + | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in <![CDATA[.. + | CDataRaw -- ^ As-is character data; pretty printer passes it along without any escaping or CDATA wrap-up. 
+ deriving ( Eq, Ord, Show, Typeable, Data ) + +-- | XML qualified names +data QName = QName { + qName :: Text, + qURI :: Maybe Text, + qPrefix :: Maybe Text + } deriving (Show, Typeable, Data) + + +instance Eq QName where + q1 == q2 = compare q1 q2 == EQ + +instance Ord QName where + compare q1 q2 = + case compare (qName q1) (qName q2) of + EQ -> case (qURI q1, qURI q2) of + (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2) + (u1,u2) -> compare u1 u2 + x -> x + +class Node t where + node :: QName -> t -> Element + +instance Node ([Attr],[Content]) where + node n (attrs,cont) = Element { elName = n + , elAttribs = attrs + , elContent = cont + , elLine = Nothing + } + +instance Node [Attr] where node n as = node n (as,[]::[Content]) +instance Node Attr where node n a = node n [a] +instance Node () where node n () = node n ([]::[Attr]) + +instance Node [Content] where node n cs = node n ([]::[Attr],cs) +instance Node Content where node n c = node n [c] +instance Node ([Attr],Content) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Content) where node n (a,c) = node n ([a],[c]) + +instance Node ([Attr],[Element]) where + node n (as,cs) = node n (as,map Elem cs) + +instance Node ([Attr],Element) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Element) where node n (a,c) = node n ([a],c) +instance Node [Element] where node n es = node n ([]::[Attr],es) +instance Node Element where node n e = node n [e] + +instance Node ([Attr],[CData]) where + node n (as,cs) = node n (as,map Text cs) + +instance Node ([Attr],CData) where node n (as,c) = node n (as,[c]) +instance Node (Attr,CData) where node n (a,c) = node n ([a],c) +instance Node [CData] where node n es = node n ([]::[Attr],es) +instance Node CData where node n e = node n [e] + +instance Node ([Attr],Text) where + node n (as,t) = node n (as, CData { cdVerbatim = CDataText + , cdData = t + , cdLine = Nothing }) + +instance Node (Attr,Text ) where node n (a,t) = node n ([a],t) +instance Node Text where node n t = node n ([]::[Attr],t) + +-- | Create node with unqualified name +unode :: Node t => Text -> t -> Element +unode = node . unqual + +unqual :: Text -> QName +unqual x = QName x Nothing Nothing + +-- | Add an attribute to an element. +add_attr :: Attr -> Element -> Element +add_attr a e = add_attrs [a] e + +-- | Add some attributes to an element. +add_attrs :: [Attr] -> Element -> Element +add_attrs as e = e { elAttribs = as ++ elAttribs e } + +-- +-- conversion from xml-light +-- + +fromXLQName :: XL.QName -> QName +fromXLQName qn = QName { qName = T.pack $ XL.qName qn + , qURI = T.pack <$> XL.qURI qn + , qPrefix = T.pack <$> XL.qPrefix qn } + +fromXLCData :: XL.CData -> CData +fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of + XL.CDataText -> CDataText + XL.CDataVerbatim -> CDataVerbatim + XL.CDataRaw -> CDataRaw + , cdData = T.pack $ XL.cdData cd + , cdLine = XL.cdLine cd } + +fromXLElement :: XL.Element -> Element +fromXLElement el = Element { elName = fromXLQName $ XL.elName el + , elAttribs = map fromXLAttr $ XL.elAttribs el + , elContent = map fromXLContent $ XL.elContent el + , elLine = XL.elLine el } + +fromXLAttr :: XL.Attr -> Attr +fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s) + +fromXLContent :: XL.Content -> Content +fromXLContent (XL.Elem el) = Elem $ fromXLElement el +fromXLContent (XL.Text cd) = Text $ fromXLCData cd +fromXLContent (XL.CRef s) = CRef (T.pack s) + + |
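
Finally, a sketch of the bridge at the end of this diff: converting an Element produced by the String-based xml-light (as libraries such as texmath still do) into the Text-based types defined here, plus building a value directly with unode. Only the fromXL*, unode/add_attrs/Attr/unqual and showElement names come from the diff; the rest is illustrative.

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import qualified Text.XML.Light as XL      -- the original String-based library
import Text.Pandoc.XML.Light

-- Render an element coming from an xml-light producer with the new printer.
renderFromXmlLight :: XL.Element -> Text
renderFromXmlLight = showElement . fromXLElement

-- Building the same kind of value directly with the Text-based types:
note :: Element
note = add_attrs [Attr (unqual "type") "aside"]
     $ unode "note" ("built with unode" :: Text)

-- showElement note == "<note type=\"aside\">built with unode</note>"
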