aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <fiddlosopher@gmail.com>2012-01-01 13:48:28 -0800
committerJohn MacFarlane <fiddlosopher@gmail.com>2012-01-01 13:48:28 -0800
commitda8425598a8ab4a98388e8ee346a2ae7ec540aa0 (patch)
tree20078094309f4fc57ea67e5b2d17163f86078ec1
parent3cf60c73061f247b531da4b3c18664c6134bee53 (diff)
downloadpandoc-da8425598a8ab4a98388e8ee346a2ae7ec540aa0.tar.gz
New treatment of dashes in --smart mode.
* `---` is always em-dash, `--` is always en-dash. * pandoc no longer tries to guess when `-` should be en-dash. * A new option, `--old-dashes`, is provided for legacy documents. Rationale: The rules for en-dash are too complex and language-dependent for a guesser to work reliably. This change gives users greater control. The alternative of using unicode isn't very good, since unicode em- and en- dashes are barely distinguishable in a monospace font.
-rw-r--r--README16
-rw-r--r--src/Text/Pandoc/Parsing.hs34
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs3
-rw-r--r--src/pandoc.hs10
-rw-r--r--tests/rst-reader.native4
-rw-r--r--tests/testsuite.txt4
6 files changed, 56 insertions, 15 deletions
diff --git a/README b/README
index 9380c895e..5b4de3942 100644
--- a/README
+++ b/README
@@ -206,11 +206,17 @@ Options
`-S`, `--smart`
: Produce typographically correct output, converting straight quotes
- to curly quotes, `---` and `--` to dashes, and `...` to ellipses.
- Nonbreaking spaces are inserted after certain abbreviations, such
- as "Mr." (Note: This option is significant only when the input format is
- `markdown` or `textile`. It is selected automatically when the input
- format is `textile` or the output format is `latex` or `context`.)
+ to curly quotes, `---` to em-dashes, `--` to en-dashes, and
+ `...` to ellipses. Nonbreaking spaces are inserted after certain
+ abbreviations, such as "Mr." (Note: This option is significant only when
+ the input format is `markdown` or `textile`. It is selected automatically
+ when the input format is `textile` or the output format is `latex` or
+ `context`.)
+
+`--old-dashes`
+: Selects the pandoc <= 1.8.2.1 behavior for parsing smart dashes: `-` before
+ a numeral is an en-dash, and `--` is an em-dash. This option is selected
+ automatically for `textile` input.
`-5`, `--html5`
: Produce HTML5 instead of HTML4. This option has no effect for writers
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index c2c512033..71da3a730 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -614,6 +614,9 @@ data ParserState = ParserState
stateDate :: [Inline], -- ^ Date of document
stateStrict :: Bool, -- ^ Use strict markdown syntax?
stateSmart :: Bool, -- ^ Use smart typography?
+ stateOldDashes :: Bool, -- ^ Use pandoc <= 1.8.2.1 behavior
+ -- in parsing dashes: @--@ is em-dash;
+ -- @-@ before numeral is en-dash
stateLiterateHaskell :: Bool, -- ^ Treat input as literate haskell
stateColumns :: Int, -- ^ Number of columns in terminal
stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used
@@ -642,6 +645,7 @@ defaultParserState =
stateDate = [],
stateStrict = False,
stateSmart = False,
+ stateOldDashes = False,
stateLiterateHaskell = False,
stateColumns = 80,
stateHeaderTable = [],
@@ -788,17 +792,37 @@ ellipses = do
try (charOrRef "\8230\133") <|> try (string "..." >> return '…')
return (Str "\8230")
-dash :: GenParser Char st Inline
-dash = enDash <|> emDash
+dash :: GenParser Char ParserState Inline
+dash = do
+ oldDashes <- stateOldDashes `fmap` getState
+ if oldDashes
+ then emDashOld <|> enDashOld
+ else Str `fmap` (hyphenDash <|> emDash <|> enDash)
-enDash :: GenParser Char st Inline
+-- Two hyphens = en-dash, three = em-dash
+hyphenDash :: GenParser Char st String
+hyphenDash = do
+ try $ string "--"
+ option "\8211" (char '-' >> return "\8212")
+
+emDash :: GenParser Char st String
+emDash = do
+ try (charOrRef "\8212\151")
+ return "\8212"
+
+enDash :: GenParser Char st String -- literal en-dash input, normalised to U+2013
enDash = do
+ try (charOrRef "\8211\150") -- en-dash code points (same pair enDashOld accepts), not the em-dash pair
+ return "\8211"
+
+enDashOld :: GenParser Char st Inline
+enDashOld = do
try (charOrRef "\8211\150") <|>
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
return (Str "\8211")
-emDash :: GenParser Char st Inline
-emDash = do
+emDashOld :: GenParser Char st Inline
+emDashOld = do
try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-')
return (Str "\8212")
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 4693bd06d..3b5954368 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -68,7 +68,8 @@ import Control.Monad ( guard, liftM )
readTextile :: ParserState -- ^ Parser state, including options for parser
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc
-readTextile state s = (readWith parseTextile) state (s ++ "\n\n")
+readTextile state s =
+ (readWith parseTextile) state{ stateOldDashes = True } (s ++ "\n\n")
--
diff --git a/src/pandoc.hs b/src/pandoc.hs
index fc28c4c3f..3660fc167 100644
--- a/src/pandoc.hs
+++ b/src/pandoc.hs
@@ -103,6 +103,7 @@ data Opt = Opt
, optSelfContained :: Bool -- ^ Make HTML accessible offline
, optXeTeX :: Bool -- ^ Format latex for xetex
, optSmart :: Bool -- ^ Use smart typography
+ , optOldDashes :: Bool -- ^ Parse dashes like pandoc <=1.8.2.1
, optHtml5 :: Bool -- ^ Produce HTML5 in HTML
, optHighlight :: Bool -- ^ Highlight source code
, optHighlightStyle :: Style -- ^ Style to use for highlighted code
@@ -149,6 +150,7 @@ defaultOpts = Opt
, optSelfContained = False
, optXeTeX = False
, optSmart = False
+ , optOldDashes = False
, optHtml5 = False
, optHighlight = True
, optHighlightStyle = pygments
@@ -245,6 +247,12 @@ options =
(\opt -> return opt { optSmart = True }))
"" -- "Use smart quotes, dashes, and ellipses"
+ , Option "" ["old-dashes"]
+ (NoArg
+ (\opt -> return opt { optSmart = True
+ , optOldDashes = True }))
+ "" -- "Use pandoc <= 1.8.2.1 behavior in parsing dashes (implies --smart)"
+
, Option "5" ["html5"]
(NoArg
(\opt -> do
@@ -735,6 +743,7 @@ main = do
, optIncremental = incremental
, optSelfContained = selfContained
, optSmart = smart
+ , optOldDashes = oldDashes
, optHtml5 = html5
, optHighlight = highlight
, optHighlightStyle = highlightStyle
@@ -858,6 +867,7 @@ main = do
stateCitations = map CSL.refId refs,
stateSmart = smart || writerName' `elem`
["latex", "context", "latex+lhs", "beamer"],
+ stateOldDashes = oldDashes,
stateColumns = columns,
stateStrict = strict,
stateIndentedCodeClasses = codeBlockClasses,
diff --git a/tests/rst-reader.native b/tests/rst-reader.native
index 8d273a1d7..e0eb4d438 100644
--- a/tests/rst-reader.native
+++ b/tests/rst-reader.native
@@ -165,14 +165,14 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite",Str ":
,([Str "city"],
[[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]])
,([Str "phone"],
- [[Para [Str "123",Str "\8211",Str "4567"]]])]]
+ [[Para [Str "123",Str "-",Str "4567"]]])]]
,DefinitionList
[([Str "address"],
[[Para [Str "61",Space,Str "Main",Space,Str "St",Str "."]]])
,([Str "city"],
[[Para [Emph [Str "Nowhere"],Str ",",Space,Str "MA,",Space,Str "USA"]]])
,([Str "phone"],
- [[Para [Str "123",Str "\8211",Str "4567"]]])]
+ [[Para [Str "123",Str "-",Str "4567"]]])]
,Header 1 [Str "HTML",Space,Str "Blocks"]
,Para [Str "Simple",Space,Str "block",Space,Str "on",Space,Str "one",Space,Str "line",Str ":"]
,RawBlock "html" "<div>foo</div>\n"
diff --git a/tests/testsuite.txt b/tests/testsuite.txt
index ccee0764a..3bb5d8cb5 100644
--- a/tests/testsuite.txt
+++ b/tests/testsuite.txt
@@ -492,9 +492,9 @@ So is 'pine.'
Here is some quoted '`code`' and a "[quoted link][1]".
-Some dashes: one---two --- three--four -- five.
+Some dashes: one---two --- three---four --- five.
-Dashes between numbers: 5-7, 255-66, 1987-1999.
+Dashes between numbers: 5--7, 255--66, 1987--1999.
Ellipses...and...and....