From 95f2726ee74a7770bf0eb5fe5629c493addbe298 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 5 Mar 2017 10:24:39 +0100 Subject: Added readerAbbreviations to ParserState. Markdown reader now consults this to determine what is an abbreviation. Eventually it will be possible to specify a custom list (see #256). --- src/Text/Pandoc/Options.hs | 11 ++++++++++ src/Text/Pandoc/Readers/Markdown.hs | 41 +++++++++++++++---------------------- 2 files changed, 27 insertions(+), 25 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 6db53c3dc..0379b0ddf 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -47,6 +47,7 @@ module Text.Pandoc.Options ( module Text.Pandoc.Extensions ) where import Data.Data (Data) import Data.Default +import qualified Data.Set as Set import Data.Typeable (Typeable) import GHC.Generics (Generic) import Text.Pandoc.Extensions @@ -60,6 +61,7 @@ data ReaderOptions = ReaderOptions{ , readerApplyMacros :: Bool -- ^ Apply macros to TeX math , readerIndentedCodeClasses :: [String] -- ^ Default classes for -- indented code blocks + , readerAbbreviations :: Set.Set String -- ^ Strings to treat as abbreviations , readerDefaultImageExtension :: String -- ^ Default extension for images , readerTrackChanges :: TrackChanges } deriving (Show, Read, Data, Typeable, Generic) @@ -72,10 +74,19 @@ instance Default ReaderOptions , readerTabStop = 4 , readerApplyMacros = True , readerIndentedCodeClasses = [] + , readerAbbreviations = defaultAbbrevs , readerDefaultImageExtension = "" , readerTrackChanges = AcceptChanges } +defaultAbbrevs :: Set.Set String +defaultAbbrevs = Set.fromList + [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", + "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", + "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", + "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", + "ch.", "sec.", "cf.", "cp."] + -- -- Writer options -- diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 4790f83ff..abaa907e0 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -42,6 +42,7 @@ import Data.Maybe import Data.Monoid ((<>)) import Data.Ord (comparing) import Data.Scientific (base10Exponent, coefficient) +import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import qualified Data.Vector as V @@ -1688,32 +1689,22 @@ nonEndline = satisfy (/='\n') str :: PandocMonad m => MarkdownParser m (F Inlines) str = do - result <- many1 alphaNum + result <- many1 (alphaNum <|> try (char '.' <* notFollowedBy (char '.'))) updateLastStrPos - let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) - isSmart <- extensionEnabled Ext_smart <$> getOption readerExtensions - if isSmart - then case likelyAbbrev result of - [] -> return $ return $ B.str result - xs -> choice (map (\x -> - try (string x >> oneOf " \n" >> - lookAhead alphaNum >> - return (return $ B.str - $ result ++ spacesToNbr x ++ "\160"))) xs) - <|> (return $ return $ B.str result) - else return $ return $ B.str result - --- | if the string matches the beginning of an abbreviation (before --- the first period, return strings that would finish the abbreviation. -likelyAbbrev :: String -> [String] -likelyAbbrev x = - let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", - "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", - "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", - "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", - "ch.", "sec.", "cf.", "cp."] - abbrPairs = map (break (=='.')) abbrevs - in map snd $ filter (\(y,_) -> y == x) abbrPairs + (do guardEnabled Ext_smart + abbrevs <- getOption readerAbbreviations + if not (null result) && last result == '.' && result `Set.member` abbrevs + then try (do ils <- whitespace <|> endline + lookAhead alphaNum + return $ do + ils' <- ils + if ils' == B.space + then return (B.str result <> B.str "\160") + else -- linebreak or softbreak + return (ils' <> B.str result <> B.str "\160")) + <|> return (return (B.str result)) + else return (return (B.str result))) + <|> return (return (B.str result)) -- an endline character that can be treated as a space, not a structural break endline :: PandocMonad m => MarkdownParser m (F Inlines) -- cgit v1.2.3