diff options
author | John MacFarlane <jgm@berkeley.edu> | 2018-09-30 22:32:00 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2018-09-30 22:32:00 -0700 |
commit | 36f1846cc3130dbe4168789cc03f916ebf5828c8 (patch) | |
tree | 8b28a2616f8948fcdb0e8d42a176234e9e3784aa /src/Text | |
parent | 0a8d212a097267cafc4cd5a64691b8e85aadb5c3 (diff) | |
download | pandoc-36f1846cc3130dbe4168789cc03f916ebf5828c8.tar.gz |
Implement `--ascii` (`writerPreferAscii`) in writers, not App.
Now the `write*` functions for Docbook, HTML, ICML, JATS,
Man, Ms, OPML are sensitive to `writerPreferAscii`. Previously
the to-ascii translation was done in Text.Pandoc.App, and
thus not available to those using the writer functions
directly.
In addition, the LaTeX writer is now sensitive to
`writerPreferAscii` and to `--ascii`. 100% ASCII
output can't be guaranteed, but the writer will use
commands like `\"{a}` and `\l` whenever possible,
to avoid emiting a non-ASCII character.
A new unexported module, Text.Pandoc.Groff, has been
added to store functions used in the different groff-based
writers.
Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/App.hs | 25 | ||||
-rw-r--r-- | src/Text/Pandoc/Groff.hs | 43 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Docbook.hs | 7 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 18 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/ICML.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/JATS.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/LaTeX.hs | 144 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Man.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Ms.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/OPML.hs | 3 |
10 files changed, 180 insertions, 74 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index cb1db4f89..79d83c0d3 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -52,7 +52,7 @@ import Data.Aeson (defaultOptions) import Data.Aeson.TH (deriveJSON) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B -import Data.Char (toLower, toUpper, isAscii, ord) +import Data.Char (toLower, toUpper) import Data.List (find, intercalate, isPrefixOf, isSuffixOf, sort) import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) @@ -95,7 +95,6 @@ import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs, headerShift, isURI, ordNub, safeRead, tabFilter, uriPathToPath) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Writers.Math (defaultKaTeXURL, defaultMathJaxURL) -import Text.Pandoc.XML (toEntities) import Text.Printf #ifndef _WINDOWS import System.Posix.IO (stdOutput) @@ -443,6 +442,7 @@ convertWithOpts opts = do , writerTOCDepth = optTOCDepth opts , writerReferenceDoc = optReferenceDoc opts , writerSyntaxMap = syntaxMap + , writerPreferAscii = optAscii opts } let readerOpts = def{ @@ -519,19 +519,10 @@ convertWithOpts opts = do let htmlFormat = format `elem` ["html","html4","html5","s5","slidy", "slideous","dzslides","revealjs"] - escape - | optAscii opts - , htmlFormat || format == "docbook4" || - format == "docbook5" || format == "docbook" || - format == "jats" || format == "opml" || - format == "icml" = toEntities - | optAscii opts - , format == "ms" || format == "man" = groffEscape - | otherwise = id addNl = if standalone then id else (<> T.singleton '\n') - output <- (addNl . escape) <$> f writerOptions doc + output <- addNl <$> f writerOptions doc writerFn eol outputFile =<< if optSelfContained opts && htmlFormat -- TODO not maximally efficient; change type @@ -539,12 +530,6 @@ convertWithOpts opts = do then T.pack <$> makeSelfContained (T.unpack output) else return output -groffEscape :: Text -> Text -groffEscape = T.concatMap toUchar - where toUchar c - | isAscii c = T.singleton c - | otherwise = T.pack $ printf "\\[u%04X]" (ord c) - type Transform = Pandoc -> Pandoc isTextFormat :: String -> Bool @@ -606,7 +591,7 @@ data Opt = Opt , optPdfEngineArgs :: [String] -- ^ Flags to pass to the engine , optSlideLevel :: Maybe Int -- ^ Header level that creates slides , optSetextHeaders :: Bool -- ^ Use atx headers for markdown level 1-2 - , optAscii :: Bool -- ^ Use ascii characters only in html + , optAscii :: Bool -- ^ Prefer ascii output , optDefaultImageExtension :: String -- ^ Default image extension , optExtractMedia :: Maybe FilePath -- ^ Path to extract embedded media , optTrackChanges :: TrackChanges -- ^ Accept or reject MS Word track-changes. @@ -1173,7 +1158,7 @@ options = , Option "" ["ascii"] (NoArg (\opt -> return opt { optAscii = True })) - "" -- "Use ascii characters only in HTML output" + "" -- "Prefer ASCII output" , Option "" ["reference-links"] (NoArg diff --git a/src/Text/Pandoc/Groff.hs b/src/Text/Pandoc/Groff.hs new file mode 100644 index 000000000..46acc8fa8 --- /dev/null +++ b/src/Text/Pandoc/Groff.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{- +Copyright (C) 2018 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Groff + Copyright : Copyright (C) 2018 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Shared functions for escaping and formatting groff. +-} +module Text.Pandoc.Groff ( groffEscape ) +where + +import Prelude +import Data.Char (isAscii, ord) +import qualified Data.Text as T +import Text.Printf (printf) + +groffEscape :: T.Text -> T.Text +groffEscape = T.concatMap toUchar + where toUchar c + | isAscii c = T.singleton c + | otherwise = T.pack $ printf "\\[u%04X]" (ord c) diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index f6e814095..3306e4f31 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -126,9 +126,10 @@ writeDocbook opts (Pandoc meta blocks) = do defField "mathml" (case writerHTMLMathMethod opts of MathML -> True _ -> False) metadata - case writerTemplate opts of - Nothing -> return main - Just tpl -> renderTemplate' tpl context + (if writerPreferAscii opts then toEntities else id) <$> + case writerTemplate opts of + Nothing -> return main + Just tpl -> renderTemplate' tpl context -- | Convert an Element to Docbook. elementToDocbook :: PandocMonad m => WriterOptions -> Int -> Element -> DB m Doc diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index c7f25197f..19ec4692e 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -75,7 +75,7 @@ import Text.Pandoc.Templates import Text.Pandoc.Walk import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared -import Text.Pandoc.XML (escapeStringForXML, fromEntities) +import Text.Pandoc.XML (escapeStringForXML, fromEntities, toEntities) #if MIN_VERSION_blaze_markup(0,6,3) #else import Text.Blaze.Internal (preEscapedString, preEscapedText) @@ -206,7 +206,8 @@ writeHtmlString' :: PandocMonad m => WriterState -> WriterOptions -> Pandoc -> m Text writeHtmlString' st opts d = do (body, context) <- evalStateT (pandocToHtml opts d) st - case writerTemplate opts of + (if writerPreferAscii opts then toEntities else id) <$> + case writerTemplate opts of Nothing -> return $ renderHtml' body Just tpl -> do -- warn if empty lang @@ -221,16 +222,19 @@ writeHtmlString' st opts d = do lookup "sourcefile" (writerVariables opts) report $ NoTitleElement fallback return $ resetField "pagetitle" fallback context - renderTemplate' tpl $ - defField "body" (renderHtml' body) context' + renderTemplate' tpl + (defField "body" (renderHtml' body) context') writeHtml' :: PandocMonad m => WriterState -> WriterOptions -> Pandoc -> m Html writeHtml' st opts d = case writerTemplate opts of Just _ -> preEscapedText <$> writeHtmlString' st opts d - Nothing -> do - (body, _) <- evalStateT (pandocToHtml opts d) st - return body + Nothing + | writerPreferAscii opts + -> preEscapedText <$> writeHtmlString' st opts d + | otherwise -> do + (body, _) <- evalStateT (pandocToHtml opts d) st + return body -- result is (title, authors, date, toc, body, new variables) pandocToHtml :: PandocMonad m diff --git a/src/Text/Pandoc/Writers/ICML.hs b/src/Text/Pandoc/Writers/ICML.hs index b8fc0dc94..ef1e2af0a 100644 --- a/src/Text/Pandoc/Writers/ICML.hs +++ b/src/Text/Pandoc/Writers/ICML.hs @@ -149,7 +149,8 @@ writeICML opts (Pandoc meta blocks) = do $ defField "charStyles" (render' $ charStylesToDoc st) $ defField "parStyles" (render' $ parStylesToDoc st) $ defField "hyperlinks" (render' $ hyperlinksToDoc $ links st) metadata - case writerTemplate opts of + (if writerPreferAscii opts then toEntities else id) <$> + case writerTemplate opts of Nothing -> return main Just tpl -> renderTemplate' tpl context diff --git a/src/Text/Pandoc/Writers/JATS.hs b/src/Text/Pandoc/Writers/JATS.hs index f55a49d4e..4e78a4cce 100644 --- a/src/Text/Pandoc/Writers/JATS.hs +++ b/src/Text/Pandoc/Writers/JATS.hs @@ -102,7 +102,8 @@ docToJATS opts (Pandoc meta blocks) = do $ defField "mathml" (case writerHTMLMathMethod opts of MathML -> True _ -> False) metadata - case writerTemplate opts of + (if writerPreferAscii opts then toEntities else id) <$> + case writerTemplate opts of Nothing -> return main Just tpl -> renderTemplate' tpl context diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index 7be3fce28..c1b5d0fa4 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -42,8 +42,9 @@ import Data.Aeson (FromJSON, object, (.=)) import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isPunctuation, ord, toLower) import Data.List (foldl', intercalate, intersperse, isInfixOf, nubBy, - stripPrefix, (\\)) + stripPrefix, (\\), uncons) import Data.Maybe (catMaybes, fromMaybe, isJust, mapMaybe, isNothing) +import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T import Network.URI (unEscapeString) @@ -63,6 +64,7 @@ import Text.Pandoc.Walk import Text.Pandoc.Writers.Shared import qualified Text.Parsec as P import Text.Printf (printf) +import qualified Data.Text.Normalize as Normalize data WriterState = WriterState { stInNote :: Bool -- true if we're in a note @@ -318,46 +320,110 @@ data StringContext = TextString -- escape things as needed for LaTeX stringToLaTeX :: PandocMonad m => StringContext -> String -> LW m String -stringToLaTeX _ [] = return "" -stringToLaTeX ctx (x:xs) = do +stringToLaTeX context zs = do opts <- gets stOptions - rest <- stringToLaTeX ctx xs - let ligatures = isEnabled Ext_smart opts && ctx == TextString - let isUrl = ctx == URLString - return $ + go opts context $ + if writerPreferAscii opts + then T.unpack $ Normalize.normalize Normalize.NFD $ T.pack zs + else zs + where + go _ _ [] = return "" + go opts ctx (x:xs) = do + let ligatures = isEnabled Ext_smart opts && ctx == TextString + let isUrl = ctx == URLString + let mbAccentCmd = + if writerPreferAscii opts && ctx == TextString + then uncons xs >>= \(c,_) -> M.lookup c accents + else Nothing + let emits s = + case mbAccentCmd of + Just cmd -> ((cmd ++ "{" ++ s ++ "}") ++) + <$> go opts ctx (drop 1 xs) -- drop combining accent + Nothing -> (s++) <$> go opts ctx xs + let emitc c = + case mbAccentCmd of + Just cmd -> ((cmd ++ "{" ++ [c] ++ "}") ++) + <$> go opts ctx (drop 1 xs) -- drop combining accent + Nothing -> (c:) <$> go opts ctx xs case x of - '{' -> "\\{" ++ rest - '}' -> "\\}" ++ rest - '`' | ctx == CodeString -> "\\textasciigrave{}" ++ rest - '$' | not isUrl -> "\\$" ++ rest - '%' -> "\\%" ++ rest - '&' -> "\\&" ++ rest - '_' | not isUrl -> "\\_" ++ rest - '#' -> "\\#" ++ rest - '-' | not isUrl -> case xs of - -- prevent adjacent hyphens from forming ligatures - ('-':_) -> "-\\/" ++ rest - _ -> '-' : rest - '~' | not isUrl -> "\\textasciitilde{}" ++ rest - '^' -> "\\^{}" ++ rest - '\\'| isUrl -> '/' : rest -- NB. / works as path sep even on Windows - | otherwise -> "\\textbackslash{}" ++ rest - '|' | not isUrl -> "\\textbar{}" ++ rest - '<' -> "\\textless{}" ++ rest - '>' -> "\\textgreater{}" ++ rest - '[' -> "{[}" ++ rest -- to avoid interpretation as - ']' -> "{]}" ++ rest -- optional arguments - '\'' | ctx == CodeString -> "\\textquotesingle{}" ++ rest - '\160' -> "~" ++ rest - '\x202F' -> "\\," ++ rest - '\x2026' -> "\\ldots{}" ++ rest - '\x2018' | ligatures -> "`" ++ rest - '\x2019' | ligatures -> "'" ++ rest - '\x201C' | ligatures -> "``" ++ rest - '\x201D' | ligatures -> "''" ++ rest - '\x2014' | ligatures -> "---" ++ rest - '\x2013' | ligatures -> "--" ++ rest - _ -> x : rest + '{' -> emits "\\{" + '}' -> emits "\\}" + '`' | ctx == CodeString -> emits "\\textasciigrave{}" + '$' | not isUrl -> emits "\\$" + '%' -> emits "\\%" + '&' -> emits "\\&" + '_' | not isUrl -> emits "\\_" + '#' -> emits "\\#" + '-' | not isUrl -> case xs of + -- prevent adjacent hyphens from forming ligatures + ('-':_) -> emits "-\\/" + _ -> emitc '-' + '~' | not isUrl -> emits "\\textasciitilde{}" + '^' -> emits "\\^{}" + '\\'| isUrl -> emitc '/' -- NB. / works as path sep even on Windows + | otherwise -> emits "\\textbackslash{}" + '|' | not isUrl -> emits "\\textbar{}" + '<' -> emits "\\textless{}" + '>' -> emits "\\textgreater{}" + '[' -> emits "{[}" -- to avoid interpretation as + ']' -> emits "{]}" -- optional arguments + '\'' | ctx == CodeString -> emits "\\textquotesingle{}" + '\160' -> emits "~" + '\x202F' -> emits "\\," + '\x2026' -> emits "\\ldots{}" + '\x2018' | ligatures -> emits "`" + '\x2019' | ligatures -> emits "'" + '\x201C' | ligatures -> emits "``" + '\x201D' | ligatures -> emits "''" + '\x2014' | ligatures -> emits "---" + '\x2013' | ligatures -> emits "--" + _ | writerPreferAscii opts + -> case x of + 'ı' -> emits "\\i " + 'ȷ' -> emits "\\j " + 'å' -> emits "\\aa " + 'Å' -> emits "\\AA " + 'ß' -> emits "\\ss " + 'ø' -> emits "\\o " + 'Ø' -> emits "\\O " + 'Ł' -> emits "\\L " + 'ł' -> emits "\\l " + 'æ' -> emits "\\ae " + 'Æ' -> emits "\\AE " + 'œ' -> emits "\\oe " + 'Œ' -> emits "\\OE " + '£' -> emits "\\pounds " + '€' -> emits "\\euro " + '©' -> emits "\\copyright " + _ -> emitc x + | otherwise -> emitc x + +accents :: M.Map Char String +accents = M.fromList + [ ('\779' , "\\H") + , ('\768' , "\\`") + , ('\769' , "\\'") + , ('\770' , "\\^") + , ('\771' , "\\~") + , ('\776' , "\\\"") + , ('\775' , "\\.") + , ('\772' , "\\=") + , ('\781' , "\\|") + , ('\817' , "\\b") + , ('\807' , "\\c") + , ('\783' , "\\G") + , ('\777' , "\\h") + , ('\803' , "\\d") + , ('\785' , "\\f") + , ('\778' , "\\r") + , ('\865' , "\\t") + , ('\782' , "\\U") + , ('\780' , "\\v") + , ('\774' , "\\u") + , ('\808' , "\\k") + , ('\785' , "\\newtie") + , ('\8413', "\\textcircled") + ] toLabel :: PandocMonad m => String -> LW m String toLabel z = go `fmap` stringToLaTeX URLString z diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs index be490bf22..b6b72d07f 100644 --- a/src/Text/Pandoc/Writers/Man.hs +++ b/src/Text/Pandoc/Writers/Man.hs @@ -45,6 +45,7 @@ import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Pretty import Text.Pandoc.Shared +import Text.Pandoc.Groff (groffEscape) import Text.Pandoc.Templates import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared @@ -107,7 +108,8 @@ pandocToMan opts (Pandoc meta blocks) = do $ defField "has-tables" hasTables $ defField "hyphenate" True $ defField "pandoc-version" pandocVersion metadata - case writerTemplate opts of + (if writerPreferAscii opts then groffEscape else id) <$> + case writerTemplate opts of Nothing -> return main Just tpl -> renderTemplate' tpl context diff --git a/src/Text/Pandoc/Writers/Ms.hs b/src/Text/Pandoc/Writers/Ms.hs index 3dcf816b8..a29524bbb 100644 --- a/src/Text/Pandoc/Writers/Ms.hs +++ b/src/Text/Pandoc/Writers/Ms.hs @@ -60,6 +60,7 @@ import Text.Pandoc.Shared import Text.Pandoc.Templates import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared +import Text.Pandoc.Groff (groffEscape) import Text.Printf (printf) import Text.TeXMath (writeEqn) @@ -127,7 +128,8 @@ pandocToMs opts (Pandoc meta blocks) = do $ defField "title-meta" titleMeta $ defField "author-meta" (intercalate "; " authorsMeta) $ defField "highlighting-macros" highlightingMacros metadata - case writerTemplate opts of + (if writerPreferAscii opts then groffEscape else id) <$> + case writerTemplate opts of Nothing -> return main Just tpl -> renderTemplate' tpl context diff --git a/src/Text/Pandoc/Writers/OPML.hs b/src/Text/Pandoc/Writers/OPML.hs index 6c48046a2..716c5cbad 100644 --- a/src/Text/Pandoc/Writers/OPML.hs +++ b/src/Text/Pandoc/Writers/OPML.hs @@ -62,7 +62,8 @@ writeOPML opts (Pandoc meta blocks) = do meta' main <- (render colwidth . vcat) <$> mapM (elementToOPML opts) elements let context = defField "body" main metadata - case writerTemplate opts of + (if writerPreferAscii opts then toEntities else id) <$> + case writerTemplate opts of Nothing -> return main Just tpl -> renderTemplate' tpl context |