aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-10-23 21:38:21 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-10-23 21:44:07 -0700
commit8efb8975ed641ddd075954e1ccc7f71eca1d3c16 (patch)
treeb1b3f561de86e3d012f1b5f1e82f141e32b43926
parent556e3eef4a26eab9ea769e06668fcba546d76b18 (diff)
downloadpandoc-8efb8975ed641ddd075954e1ccc7f71eca1d3c16.tar.gz
Groff writer character escaping changes.
T.P.GroffChar: replaced `essentialEscapes` with `manEscapes`, which includes all the escapes mentioned in the groff_man manual. T.P.Writers.Groff: removed escapeCode; changed parameter on escapeString from Bool to new type `EscapeMode`. Rewrote `escapeString`.
-rw-r--r--src/Text/Pandoc/GroffChar.hs23
-rw-r--r--src/Text/Pandoc/Writers/Groff.hs52
-rw-r--r--src/Text/Pandoc/Writers/Man.hs20
-rw-r--r--src/Text/Pandoc/Writers/Ms.hs30
-rw-r--r--test/command/4550.md2
-rw-r--r--test/tables.ms8
-rw-r--r--test/writer.man6
-rw-r--r--test/writer.ms46
8 files changed, 101 insertions, 86 deletions
diff --git a/src/Text/Pandoc/GroffChar.hs b/src/Text/Pandoc/GroffChar.hs
index 6d7991e5d..efb3cf11a 100644
--- a/src/Text/Pandoc/GroffChar.hs
+++ b/src/Text/Pandoc/GroffChar.hs
@@ -31,24 +31,29 @@ Groff character escaping/unescaping.
-}
module Text.Pandoc.GroffChar (
- essentialEscapes
+ manEscapes
, characterCodes
, combiningAccents
) where
import Prelude
-import qualified Data.Map as Map
-essentialEscapes :: Map.Map Char String
-essentialEscapes = Map.fromList
- [ ('\160', "\\~")
+-- | These are the escapes specifically mentioned in groff_man(7).
+manEscapes :: [(Char, String)]
+manEscapes =
+ [ ('\160', "\\ ")
, ('\'', "\\[aq]")
- , ('`', "\\[ga]")
+ , ('‘', "\\[oq]")
+ , ('’', "\\[cq]")
, ('"', "\\[dq]")
- , ('~', "\\[ti]")
+ , ('“', "\\[lq]")
+ , ('”', "\\[rq]")
+ , ('—', "\\[em]")
+ , ('–', "\\[en]")
+ , ('`', "\\[ga]")
, ('^', "\\[ha]")
- , ('@', "\\[at]")
- , ('\\', "\\[rs]")
+ , ('~', "\\[ti]")
, ('-', "\\-") -- minus; - will be interpreted as hyphen U+2010
+ , ('\\', "\\[rs]")
, ('\x2026', "\\&...") -- because u2026 doesn't render on tty
]
diff --git a/src/Text/Pandoc/Writers/Groff.hs b/src/Text/Pandoc/Writers/Groff.hs
index fb3cc085b..b0e8d3d06 100644
--- a/src/Text/Pandoc/Writers/Groff.hs
+++ b/src/Text/Pandoc/Writers/Groff.hs
@@ -34,22 +34,21 @@ module Text.Pandoc.Writers.Groff (
, defaultWriterState
, MS
, Note
+ , EscapeMode(..)
, escapeString
- , escapeCode
, withFontFeature
) where
import Prelude
import Data.Char (ord, isAscii)
import Control.Monad.State.Strict
-import Data.List (intercalate)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe, isJust, catMaybes)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Pretty
import Text.Printf (printf)
-import Text.Pandoc.GroffChar (essentialEscapes, characterCodes,
- combiningAccents)
+import Text.Pandoc.GroffChar (manEscapes,
+ characterCodes, combiningAccents)
data WriterState = WriterState { stHasInlineMath :: Bool
, stFirstPara :: Bool
@@ -80,33 +79,38 @@ type Note = [Block]
type MS = StateT WriterState
+data EscapeMode = AllowUTF8 -- ^ use preferred man escapes
+ | AsciiOnly -- ^ escape everything
+ deriving Show
+
combiningAccentsMap :: Map.Map Char String
combiningAccentsMap = Map.fromList combiningAccents
+essentialEscapes :: Map.Map Char String
+essentialEscapes = Map.fromList manEscapes
+
-- | Escape special characters for groff.
-escapeString :: Bool -> String -> String
+escapeString :: EscapeMode -> String -> String
escapeString _ [] = []
-escapeString useAscii (x:xs) =
+escapeString escapeMode ('\n':'.':xs) =
+ '\n':'\\':'&':'.':escapeString escapeMode xs
+escapeString escapeMode (x:xs) =
case Map.lookup x essentialEscapes of
- Just s -> s ++ escapeString useAscii xs
+ Just s -> s ++ escapeString escapeMode xs
Nothing
- | isAscii x || not useAscii -> x : escapeString useAscii xs
- | otherwise ->
- let accents = catMaybes $ takeWhile isJust
- (map (\c -> Map.lookup c combiningAccentsMap) xs)
- rest = drop (length accents) xs
- s = case Map.lookup x characterCodeMap of
- Just t -> "\\[" <> unwords (t:accents) <> "]"
- Nothing -> "\\[" <> unwords
- (printf "u%04X" (ord x) : accents) <> "]"
- in s ++ escapeString useAscii rest
-
--- | Escape a literal (code) section for groff.
-escapeCode :: Bool -> String -> String
-escapeCode useAscii = intercalate "\n" . map escapeLine . lines
- where escapeLine xs = case xs of
- ('.':_) -> "\\%" ++ escapeString useAscii xs
- _ -> escapeString useAscii xs
+ | isAscii x -> x : escapeString escapeMode xs
+ | otherwise ->
+ case escapeMode of
+ AllowUTF8 -> x : escapeString escapeMode xs
+ AsciiOnly ->
+ let accents = catMaybes $ takeWhile isJust
+ (map (\c -> Map.lookup c combiningAccentsMap) xs)
+ rest = drop (length accents) xs
+ s = case Map.lookup x characterCodeMap of
+ Just t -> "\\[" <> unwords (t:accents) <> "]"
+ Nothing -> "\\[" <> unwords
+ (printf "u%04X" (ord x) : accents) <> "]"
+ in s ++ escapeString escapeMode rest
characterCodeMap :: Map.Map Char String
characterCodeMap = Map.fromList characterCodes
diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs
index 645476b77..b32d2ff6c 100644
--- a/src/Text/Pandoc/Writers/Man.hs
+++ b/src/Text/Pandoc/Writers/Man.hs
@@ -97,6 +97,9 @@ pandocToMan opts (Pandoc meta blocks) = do
Nothing -> return main
Just tpl -> renderTemplate' tpl context
+escString :: WriterOptions -> String -> String
+escString _ = escapeString AsciiOnly -- for better portability
+
-- | Return man representation of notes.
notesToMan :: PandocMonad m => WriterOptions -> [[Block]] -> StateT WriterState m Doc
notesToMan opts notes =
@@ -143,11 +146,14 @@ blockToMan opts (Header level _ inlines) = do
1 -> ".SH "
_ -> ".SS "
return $ text heading <> contents
-blockToMan _ (CodeBlock _ str) = return $
+blockToMan opts (CodeBlock _ str) = return $
text ".IP" $$
text ".nf" $$
text "\\f[C]" $$
- text (escapeCode True str) $$
+ ((case str of
+ '.':_ -> text "\\&"
+ _ -> mempty) <>
+ text (escString opts str)) $$
text "\\f[R]" $$
text ".fi"
blockToMan opts (BlockQuote blocks) = do
@@ -296,11 +302,11 @@ inlineToMan opts (Quoted DoubleQuote lst) = do
return $ text "\\[lq]" <> contents <> text "\\[rq]"
inlineToMan opts (Cite _ lst) =
inlineListToMan opts lst
-inlineToMan _ (Code _ str) =
- withFontFeature 'C' (return (text $ escapeCode True str))
-inlineToMan _ (Str str@('.':_)) =
- return $ afterBreak "\\&" <> text (escapeString True str)
-inlineToMan _ (Str str) = return $ text $ escapeString True str
+inlineToMan opts (Code _ str) =
+ withFontFeature 'C' (return (text $ escString opts str))
+inlineToMan opts (Str str@('.':_)) =
+ return $ afterBreak "\\&" <> text (escString opts str)
+inlineToMan opts (Str str) = return $ text $ escString opts str
inlineToMan opts (Math InlineMath str) =
lift (texMathToInlines InlineMath str) >>= inlineListToMan opts
inlineToMan opts (Math DisplayMath str) = do
diff --git a/src/Text/Pandoc/Writers/Ms.hs b/src/Text/Pandoc/Writers/Ms.hs
index e077e9ed9..2fb949cb9 100644
--- a/src/Text/Pandoc/Writers/Ms.hs
+++ b/src/Text/Pandoc/Writers/Ms.hs
@@ -69,9 +69,6 @@ writeMs :: PandocMonad m => WriterOptions -> Pandoc -> m Text
writeMs opts document =
evalStateT (pandocToMs opts document) defaultWriterState
-escString :: WriterOptions -> String -> String
-escString opts = escapeString (writerPreferAscii opts)
-
-- | Return groff ms representation of document.
pandocToMs :: PandocMonad m => WriterOptions -> Pandoc -> MS m Text
pandocToMs opts (Pandoc meta blocks) = do
@@ -87,8 +84,8 @@ pandocToMs opts (Pandoc meta blocks) = do
body <- blockListToMs opts blocks
let main = render' body
hasInlineMath <- gets stHasInlineMath
- let titleMeta = (escString opts . stringify) $ docTitle meta
- let authorsMeta = map (escString opts . stringify) $ docAuthors meta
+ let titleMeta = (escapeStr opts . stringify) $ docTitle meta
+ let authorsMeta = map (escapeStr opts . stringify) $ docAuthors meta
hasHighlighting <- gets stHighlighting
let highlightingMacros = if hasHighlighting
then case writerHighlightStyle opts of
@@ -108,6 +105,10 @@ pandocToMs opts (Pandoc meta blocks) = do
Nothing -> return main
Just tpl -> renderTemplate' tpl context
+escapeStr :: WriterOptions -> String -> String
+escapeStr opts =
+ escapeString (if writerPreferAscii opts then AsciiOnly else AllowUTF8)
+
escapeUri :: String -> String
escapeUri = escapeURIString (\c -> c /= '@' && isAllowedInURI c)
@@ -121,11 +122,11 @@ toSmallCaps :: WriterOptions -> String -> String
toSmallCaps _ [] = []
toSmallCaps opts (c:cs)
| isLower c = let (lowers,rest) = span isLower (c:cs)
- in "\\s-2" ++ escString opts (map toUpper lowers) ++
+ in "\\s-2" ++ escapeStr opts (map toUpper lowers) ++
"\\s0" ++ toSmallCaps opts rest
| isUpper c = let (uppers,rest) = span isUpper (c:cs)
- in escString opts uppers ++ toSmallCaps opts rest
- | otherwise = escapeString (writerPreferAscii opts) [c] ++ toSmallCaps opts cs
+ in escapeStr opts uppers ++ toSmallCaps opts rest
+ | otherwise = escapeStr opts [c] ++ toSmallCaps opts cs
-- We split inline lists into sentences, and print one sentence per
-- line. groff/troff treats the line-ending period differently.
@@ -162,7 +163,7 @@ blockToMs opts (Para [Image attr alt (src,_tit)])
_ -> empty
capt <- inlineListToMs' opts alt
return $ nowrap (text ".PSPIC -C " <>
- doubleQuotes (text (escString opts src)) <>
+ doubleQuotes (text (escapeStr opts src)) <>
sizeAttrs) $$
text ".ce 1000" $$
capt $$
@@ -200,7 +201,7 @@ blockToMs opts (Header level (ident,classes,_) inlines) = do
(if null secnum
then ""
else " ") ++
- escString opts (stringify inlines))
+ escapeStr opts (stringify inlines))
let backlink = nowrap (text ".pdfhref L -D " <>
doubleQuotes (text (toAscii ident)) <> space <> text "\\") <> cr <>
text " -- "
@@ -409,7 +410,7 @@ inlineToMs opts (Str str) = do
smallcaps <- gets stSmallCaps
if smallcaps
then return $ shim <> text (toSmallCaps opts str)
- else return $ shim <> text (escString opts str)
+ else return $ shim <> text (escapeStr opts str)
inlineToMs opts (Math InlineMath str) = do
modify $ \st -> st{ stHasInlineMath = True }
res <- convertMath writeEqn InlineMath str
@@ -453,7 +454,7 @@ inlineToMs opts (Link _ txt (src, _)) = do
text " -- " <> doubleQuotes (nowrap contents) <> cr <> text "\\&"
inlineToMs opts (Image _ alternate (_, _)) =
return $ char '[' <> text "IMAGE: " <>
- text (escString opts (stringify alternate))
+ text (escapeStr opts (stringify alternate))
<> char ']'
inlineToMs _ (Note contents) = do
modify $ \st -> st{ stNotes = contents : stNotes st }
@@ -540,15 +541,14 @@ msFormatter opts _fmtopts =
where fmtLine = hcat . map fmtToken
fmtToken (toktype, tok) = text "\\*" <>
brackets (text (show toktype) <> text " \""
- <> text (escapeCode (writerPreferAscii opts)
- (T.unpack tok)) <> text "\"")
+ <> text (escapeStr opts (T.unpack tok)) <> text "\"")
highlightCode :: PandocMonad m => WriterOptions -> Attr -> String -> MS m Doc
highlightCode opts attr str =
case highlight (writerSyntaxMap opts) (msFormatter opts) attr str of
Left msg -> do
unless (null msg) $ report $ CouldNotHighlight msg
- return $ text (escapeCode (writerPreferAscii opts) str)
+ return $ text (escapeStr opts str)
Right h -> do
modify (\st -> st{ stHighlighting = True })
return h
diff --git a/test/command/4550.md b/test/command/4550.md
index bf3afce5b..45ed21a00 100644
--- a/test/command/4550.md
+++ b/test/command/4550.md
@@ -3,5 +3,5 @@
A ‘simple’ example
^D
.LP
-A ‘simple’ example
+A \[oq]simple\[cq] example
```
diff --git a/test/tables.ms b/test/tables.ms
index 6d9138c64..90662aaad 100644
--- a/test/tables.ms
+++ b/test/tables.ms
@@ -135,7 +135,7 @@ T}
.LP
Multiline table with caption:
.PP
-Here’s the caption. It may span multiple lines.
+Here\[cq]s the caption. It may span multiple lines.
.TS
delim(@@) tab( );
cw(10.5n) lw(9.6n) rw(11.4n) lw(24.5n).
@@ -165,7 +165,7 @@ row
T} T{
5.0
T} T{
-Here’s another one.
+Here\[cq]s another one.
Note the blank line between rows.
T}
.TE
@@ -201,7 +201,7 @@ row
T} T{
5.0
T} T{
-Here’s another one.
+Here\[cq]s another one.
Note the blank line between rows.
T}
.TE
@@ -261,7 +261,7 @@ row
T} T{
5.0
T} T{
-Here’s another one.
+Here\[cq]s another one.
Note the blank line between rows.
T}
.TE
diff --git a/test/writer.man b/test/writer.man
index 4a05b74fd..4dab58c21 100644
--- a/test/writer.man
+++ b/test/writer.man
@@ -308,7 +308,7 @@ Nested.
.PP
Should not be a list item:
.PP
-M.A.\~2007
+M.A.\ 2007
.PP
B.
Williams
@@ -492,9 +492,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
.PP
[STRIKEOUT:This is \f[I]strikeout\f[R].]
.PP
-Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\~there^.
+Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\ there^.
.PP
-Subscripts: H~2~O, H~23~O, H~many\~of\~them~O.
+Subscripts: H~2~O, H~23~O, H~many\ of\ them~O.
.PP
These should not be superscripts or subscripts, because of the unescaped
spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.
diff --git a/test/writer.ms b/test/writer.ms
index 40ecddb27..0cb60a3bc 100644
--- a/test/writer.ms
+++ b/test/writer.ms
@@ -75,7 +75,7 @@ July 17, 2006
.1C
.LP
This is a set of tests for pandoc.
-Most of them are adapted from John Gruber’s markdown test suite.
+Most of them are adapted from John Gruber\[cq]s markdown test suite.
.HLINE
.SH 1
Headers
@@ -126,7 +126,7 @@ Paragraphs
.pdfhref O 1 "Paragraphs"
.pdfhref M "paragraphs"
.LP
-Here’s a regular paragraph.
+Here\[cq]s a regular paragraph.
.PP
In Markdown 1.0.0 and earlier.
Version 8.
@@ -134,7 +134,7 @@ This line turns into a list item.
Because a hard\-wrapped line in the middle of a paragraph looked like a list
item.
.PP
-Here’s one with a bullet.
+Here\[cq]s one with a bullet.
* criminey.
.PP
There should be a hard line break
@@ -314,7 +314,7 @@ Item 1, graf one.
.PP
Item 1.
graf two.
-The quick brown fox jumped over the lazy dog’s back.
+The quick brown fox jumped over the lazy dog\[cq]s back.
.RE
.IP " 2." 4
Item 2.
@@ -335,7 +335,7 @@ Tab
.RE
.RE
.LP
-Here’s another:
+Here\[cq]s another:
.IP " 1." 4
First
.IP " 2." 4
@@ -431,7 +431,7 @@ Nested.
.LP
Should not be a list item:
.PP
-M.A.\~2007
+M.A.\ 2007
.PP
B.
Williams
@@ -570,7 +570,7 @@ Interpreted markdown in a table:
This is \f[I]emphasized\f[R]
And this is \f[B]strong\f[R]
.PP
-Here’s a simple block:
+Here\[cq]s a simple block:
.LP
foo
.LP
@@ -617,7 +617,7 @@ Code:
\f[]
.fi
.LP
-Hr’s:
+Hr\[cq]s:
.HLINE
.SH 1
Inline Markup
@@ -646,9 +646,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
.PP
\m[strikecolor]This is \f[I]strikeout\f[R].\m[]
.PP
-Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\~there\*}.
+Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\ there\*}.
.PP
-Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\~of\~them\*>O.
+Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\ of\ them\*>O.
.PP
These should not be superscripts or subscripts, because of the unescaped
spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.
@@ -666,16 +666,16 @@ Smart quotes, ellipses, dashes
`Oak,' `elm,' and `beech' are names of trees.
So is `pine.'
.PP
-`He said, \[lq]I want to go.\[rq]' Were you alive in the 70’s?
+`He said, \[lq]I want to go.\[rq]' Were you alive in the 70\[cq]s?
.PP
Here is some quoted `\f[C]code\f[R]' and a \[lq]\c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "quoted link"
\&\[rq].
.PP
-Some dashes: one—two — three—four — five.
+Some dashes: one\[em]two \[em] three\[em]four \[em] five.
.PP
-Dashes between numbers: 5–7, 255–66, 1987–1999.
+Dashes between numbers: 5\[en]7, 255\[en]66, 1987\[en]1999.
.PP
Ellipses\&...and\&...and\&....
.HLINE
@@ -695,14 +695,14 @@ LaTeX
.IP \[bu] 3
@p@\-Tree
.IP \[bu] 3
-Here’s some display math:
+Here\[cq]s some display math:
.EQ
d over {d x} f ( x ) = lim sub {h -> 0} {f ( x + h ) \[u2212] f ( x )} over h
.EN
.IP \[bu] 3
-Here’s one that has a line break in it: @alpha + omega times x sup 2@.
+Here\[cq]s one that has a line break in it: @alpha + omega times x sup 2@.
.LP
-These shouldn’t be math:
+These shouldn\[cq]t be math:
.IP \[bu] 3
To get the famous equation, write \f[C]$e = mc\[ha]2$\f[R].
.IP \[bu] 3
@@ -714,7 +714,7 @@ Shoes ($20) and socks ($5).
.IP \[bu] 3
Escaped \f[C]$\f[R]: $73 \f[I]this should be emphasized\f[R] 23$.
.LP
-Here’s a LaTeX table:
+Here\[cq]s a LaTeX table:
.HLINE
.SH 1
Special Characters
@@ -885,22 +885,22 @@ With ampersands
.pdfhref O 2 "With ampersands"
.pdfhref M "with-ampersands"
.LP
-Here’s a \c
+Here\[cq]s a \c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "link with an ampersand in the URL"
\&.
.PP
-Here’s a link with an amersand in the link text: \c
+Here\[cq]s a link with an amersand in the link text: \c
.pdfhref W -D "http://att.com/" -A "\c" \
-- "AT&T"
\&.
.PP
-Here’s an \c
+Here\[cq]s an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link"
\&.
.PP
-Here’s an \c
+Here\[cq]s an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link in pointy braces"
\&.
@@ -925,7 +925,7 @@ It should.
.LP
An e\-mail address: \c
.pdfhref W -D "mailto:nobody%40nowhere.net" -A "\c" \
- -- "nobody\[at]nowhere.net"
+ -- "nobody@nowhere.net"
\&
.RS
.LP
@@ -967,7 +967,7 @@ It need not be placed at the end of the document.
.FE
and another.\**
.FS
-Here’s the long note.
+Here\[cq]s the long note.
This one contains multiple blocks.
.PP
Subsequent blocks are indented to show that they belong to the footnote (as