From 824bb2d22e40e035703ccf6ec7fd6bcde51950ce Mon Sep 17 00:00:00 2001
From: fiddlosopher
Date: Fri, 11 Jul 2008 02:14:57 +0000
Subject: In smart mode, use nonbreaking spaces after abbreviations in markdown
parser. Thus, for example, "Mr. Brown" comes out as "Mr.~Brown" in LaTeX, and
does not produce a sentence-separating space. Resolves Issue #75.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1298 788f1e2b-df1e-0410-8736-df70ead52e1b
---
README | 9 +++++----
Text/Pandoc/Readers/Markdown.hs | 25 +++++++++++++++++++++++--
tests/testsuite.native | 2 +-
tests/writer.context | 2 +-
tests/writer.docbook | 2 +-
tests/writer.html | 2 +-
tests/writer.latex | 2 +-
tests/writer.man | 2 +-
tests/writer.markdown | 2 +-
tests/writer.native | 2 +-
tests/writer.rst | 2 +-
tests/writer.rtf | 2 +-
tests/writer.texinfo | 2 +-
13 files changed, 39 insertions(+), 17 deletions(-)
diff --git a/README b/README
index 43ecaeb6a..8352d7591 100644
--- a/README
+++ b/README
@@ -316,10 +316,11 @@ For further documentation, see the `pandoc(1)` man page.
`-S` or `--smart`
: causes `pandoc` to produce typographically correct output, along the
lines of John Gruber's [Smartypants]. Straight quotes are converted
- to curly quotes, `---` to dashes, and `...` to ellipses. (Note: This
- option is only significant when the input format is `markdown`.
- It is selected automatically when the output format is `latex` or
- `context`.)
+ to curly quotes, `---` to dashes, and `...` to ellipses. Nonbreaking
+ spaces are inserted after certain abbreviations, such as "Mr."
+ (Note: This option is only significant when the input format is
+ `markdown`. It is selected automatically when the output format is
+ `latex` or `context`.)
`-m`*[url]* or `--asciimathml`*[=url]*
: causes `pandoc` to use Peter Jipsen's [ASCIIMathML] script to display
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index c9fbbe2d9..d0d99b607 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -33,7 +33,7 @@ module Text.Pandoc.Readers.Markdown (
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex )
import Data.Ord ( comparing )
-import Data.Char ( isAlphaNum )
+import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit )
import Data.Maybe ( fromMaybe )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
@@ -697,7 +697,8 @@ table = simpleTable <|> multilineTable <?> "table"
inline = choice inlineParsers <?> "inline"
-inlineParsers = [ str
+inlineParsers = [ abbrev
+ , str
, smartPunctuation
, whitespace
, endline
@@ -792,6 +793,26 @@ subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space
return . Subscript
+abbrev = failUnlessSmart >>
+ (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
+
+-- a string of letters followed by a period that does not end a sentence
+-- is assumed to be an abbreviation. It is assumed that sentences don't
+-- start with lowercase letters or numerals.
+assumedAbbrev = try $ do
+ result <- many1 $ satisfy isAlpha
+ string ". "
+ lookAhead $ satisfy (\x -> isLower x || isDigit x)
+ return result
+
+-- these strings are treated as abbreviations even if they are followed
+-- by a capital letter (such as a name).
+knownAbbrev = try $ do
+ result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
+ "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs" ]
+ string ". "
+ return result
+
smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ]
diff --git a/tests/testsuite.native b/tests/testsuite.native
index d3c4835ed..4ecc51438 100644
--- a/tests/testsuite.native
+++ b/tests/testsuite.native
@@ -165,7 +165,7 @@ Pandoc (Meta [Str "Pandoc",Space,Str "Test",Space,Str "Suite"] ["John MacFarlane
[ [ Plain [Str "Nested",Str "."] ]
] ] ]
, Para [Str "Should",Space,Str "not",Space,Str "be",Space,Str "a",Space,Str "list",Space,Str "item:"]
-, Para [Str "M",Str ".",Str "A",Str ".",Space,Str "2007"]
+, Para [Str "M",Str ".",Str "A.\160",Str "2007"]
, Para [Str "B",Str ".",Space,Str "Williams"]
, HorizontalRule
, Header 1 [Str "Definition",Space,Str "Lists"]
diff --git a/tests/writer.context b/tests/writer.context
index 7c9678636..e13a906dd 100644
--- a/tests/writer.context
+++ b/tests/writer.context
@@ -444,7 +444,7 @@ Autonumbering:
Should not be a list item:
-M.A. 2007
+M.A.~2007
B. Williams
diff --git a/tests/writer.docbook b/tests/writer.docbook
index ad774f685..4860e7d66 100644
--- a/tests/writer.docbook
+++ b/tests/writer.docbook
@@ -606,7 +606,7 @@ These should not be escaped: \$ \\ \> \[ \{
Should not be a list item:
- M.A. 2007
+ M.A. 2007
B. Williams
diff --git a/tests/writer.html b/tests/writer.html
index 49bf3a691..819917c57 100644
--- a/tests/writer.html
+++ b/tests/writer.html
@@ -412,7 +412,7 @@ These should not be escaped: \$ \\ \> \[ \{
>Should not be a list item:
M.A. 2007
M.A. 2007
B. Williams
\\[ \\\{\par}
{\pard \ql \f0 \sa0 \li360 \fi-360 2.\tx360\tab More.\par}
{\pard \ql \f0 \sa0 \li720 \fi-360 a.\tx360\tab Nested.\sa180\sa180\par}
{\pard \ql \f0 \sa180 \li0 \fi0 Should not be a list item:\par}
-{\pard \ql \f0 \sa180 \li0 \fi0 M.A. 2007\par}
+{\pard \ql \f0 \sa180 \li0 \fi0 M.A.\u160?2007\par}
{\pard \ql \f0 \sa180 \li0 \fi0 B. Williams\par}
{\pard \qc \f0 \sa180 \li0 \fi0 \emdash\emdash\emdash\emdash\emdash\par}
{\pard \ql \f0 \sa180 \li0 \fi0 \b \fs36 Definition Lists\par}
diff --git a/tests/writer.texinfo b/tests/writer.texinfo
index bb2ced1ba..224aa2091 100644
--- a/tests/writer.texinfo
+++ b/tests/writer.texinfo
@@ -506,7 +506,7 @@ Nested.
Should not be a list item:
-M.A. 2007
+M.A.@ 2007
B. Williams
--
cgit v1.2.3