Added readerAbbreviations to ReaderOptions.

Markdown reader now consults this to determine what is an abbreviation. Eventually it will be possible to specify a custom list (see #256).
author: John MacFarlane <jgm@berkeley.edu> 2017-03-05 10:24:39 +0100
committer: John MacFarlane <jgm@berkeley.edu> 2017-03-05 10:24:39 +0100
commit: 95f2726ee74a7770bf0eb5fe5629c493addbe298 (patch)
tree: 9970c4febe033118e910e465e94aca9c41f34771 /src/Text/Pandoc
parent: 7fc6919f9039a3c71028b807d4372f18cf35bee8 (diff)
download: pandoc-95f2726ee74a7770bf0eb5fe5629c493addbe298.tar.gz
2 files changed, 27 insertions, 25 deletions
diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs
index 6db53c3dc..0379b0ddf 100644
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Options ( module Text.Pandoc.Extensions
                            ) where
 import Data.Data (Data)
 import Data.Default
+import qualified Data.Set as Set
 import Data.Typeable (Typeable)
 import GHC.Generics (Generic)
 import Text.Pandoc.Extensions
@@ -60,6 +61,7 @@ data ReaderOptions = ReaderOptions{
        , readerApplyMacros           :: Bool -- ^ Apply macros to TeX math
        , readerIndentedCodeClasses   :: [String] -- ^ Default classes for
                                        -- indented code blocks
+       , readerAbbreviations         :: Set.Set String -- ^ Strings to treat as abbreviations
        , readerDefaultImageExtension :: String -- ^ Default extension for images
        , readerTrackChanges          :: TrackChanges
 } deriving (Show, Read, Data, Typeable, Generic)
@@ -72,10 +74,19 @@ instance Default ReaderOptions
                , readerTabStop               = 4
                , readerApplyMacros           = True
                , readerIndentedCodeClasses   = []
+               , readerAbbreviations         = defaultAbbrevs
                , readerDefaultImageExtension = ""
                , readerTrackChanges          = AcceptChanges
                }
 
+defaultAbbrevs :: Set.Set String
+defaultAbbrevs = Set.fromList
+                 [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
+                   "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
+                   "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
+                   "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
+                   "ch.", "sec.", "cf.", "cp."]
+
 --
 -- Writer options
 --
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 4790f83ff..abaa907e0 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,6 +42,7 @@ import Data.Maybe
 import Data.Monoid ((<>))
 import Data.Ord (comparing)
 import Data.Scientific (base10Exponent, coefficient)
+import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Vector as V
@@ -1688,32 +1689,22 @@ nonEndline = satisfy (/='\n')
 
 str :: PandocMonad m => MarkdownParser m (F Inlines)
 str = do
-  result <- many1 alphaNum
+  result <- many1 (alphaNum <|> try (char '.' <* notFollowedBy (char '.')))
   updateLastStrPos
-  let spacesToNbr = map (\c -> if c == ' ' then '\160' else c)
-  isSmart <- extensionEnabled Ext_smart <$> getOption readerExtensions
-  if isSmart
-     then case likelyAbbrev result of
-               []        -> return $ return $ B.str result
-               xs        -> choice (map (\x ->
-                               try (string x >> oneOf " \n" >>
-                                    lookAhead alphaNum >>
-                                    return (return $ B.str
-                                                  $ result ++ spacesToNbr x ++ "\160"))) xs)
-                           <|> (return $ return $ B.str result)
-     else return $ return $ B.str result
-
--- | if the string matches the beginning of an abbreviation (before
--- the first period, return strings that would finish the abbreviation.
-likelyAbbrev :: String -> [String]
-likelyAbbrev x =
-  let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
-                  "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
-                  "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
-                  "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
-                  "ch.", "sec.", "cf.", "cp."]
-      abbrPairs = map (break (=='.')) abbrevs
-  in  map snd $ filter (\(y,_) -> y == x) abbrPairs
+  (do guardEnabled Ext_smart
+      abbrevs <- getOption readerAbbreviations
+      if not (null result) && last result == '.' && result `Set.member` abbrevs
+         then try (do ils <- whitespace <|> endline
+                      lookAhead alphaNum
+                      return $ do
+                        ils' <- ils
+                        if ils' == B.space
+                           then return (B.str result <> B.str "\160")
+                           else -- linebreak or softbreak
+                                return (ils' <> B.str result <> B.str "\160"))
+                <|> return (return (B.str result))
+         else return (return (B.str result)))
+     <|> return (return (B.str result))
 
 -- an endline character that can be treated as a space, not a structural break
 endline :: PandocMonad m => MarkdownParser m (F Inlines)
author	John MacFarlane <jgm@berkeley.edu>	2017-03-05 10:24:39 +0100
committer	John MacFarlane <jgm@berkeley.edu>	2017-03-05 10:24:39 +0100
commit	95f2726ee74a7770bf0eb5fe5629c493addbe298 (patch)
tree	9970c4febe033118e910e465e94aca9c41f34771 /src/Text/Pandoc
parent	7fc6919f9039a3c71028b807d4372f18cf35bee8 (diff)
download	pandoc-95f2726ee74a7770bf0eb5fe5629c493addbe298.tar.gz