aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2009-04-30 04:39:45 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2009-04-30 04:39:45 +0000
commitab100f7c5e5174620e8cff8dc965be68a1f3ba5a (patch)
treea6b84b4c6dead7799c0eacc9d4cf8b5c335774a2 /src/Text
parent490c4304f5d249ae1fa045d15baf473386e4fc65 (diff)
downloadpandoc-ab100f7c5e5174620e8cff8dc965be68a1f3ba5a.tar.gz
Markdown reader: improved efficiency of abbreviation parsing.
Instead of a separate abbrev parser, we just check for abbreviations each time we parse a string. This gives a huge performance boost with -S. Resolves Issue #141. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1570 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs55
1 files changed, 26 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index be9c4cfd5..6300d048a 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -34,7 +34,7 @@ module Text.Pandoc.Readers.Markdown (
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate )
import Data.Ord ( comparing )
-import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper )
+import Data.Char ( isAlphaNum, isUpper )
import Data.Maybe
import Text.Pandoc.Definition
import Text.Pandoc.Shared
@@ -820,8 +820,7 @@ inline :: GenParser Char ParserState Inline
inline = choice inlineParsers <?> "inline"
inlineParsers :: [GenParser Char ParserState Inline]
-inlineParsers = [ abbrev
- , str
+inlineParsers = [ str
, smartPunctuation
, whitespace
, endline
@@ -944,30 +943,6 @@ subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Subscript
-abbrev :: GenParser Char ParserState Inline
-abbrev = failUnlessSmart >>
- (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160")
-
--- an string of letters followed by a period that does not end a sentence
--- is assumed to be an abbreviation. It is assumed that sentences don't
--- start with lowercase letters or numerals.
-assumedAbbrev :: GenParser Char ParserState [Char]
-assumedAbbrev = try $ do
- result <- many1 $ satisfy isAlpha
- string ". "
- lookAhead $ satisfy (\x -> isLower x || isDigit x)
- return result
-
--- these strings are treated as abbreviations even if they are followed
--- by a capital letter (such as a name).
-knownAbbrev :: GenParser Char ParserState [Char]
-knownAbbrev = try $ do
- result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen",
- "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs",
- "Sen", "Rep", "Pres", "Hon", "Rev" ]
- string ". "
- return result
-
smartPunctuation :: GenParser Char ParserState Inline
smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ]
@@ -1060,8 +1035,30 @@ nonEndline = satisfy (/='\n')
strChar :: GenParser Char st Char
strChar = noneOf (specialChars ++ " \t\n")
-str :: GenParser Char st Inline
-str = many1 strChar >>= return . Str
+str :: GenParser Char ParserState Inline
+str = do
+ result <- many1 strChar
+ state <- getState
+ if stateSmart state
+ then case likelyAbbrev result of
+ [] -> return $ Str result
+ xs -> choice (map (\x ->
+ try (string x >> char ' ' >>
+ notFollowedBy spaceChar >>
+ return (Str $ result ++ x ++ "\160"))) xs)
+ <|> (return $ Str result)
+ else return $ Str result
+
+-- | if the string matches the beginning of an abbreviation (before
+-- the first period, return strings that would finish the abbreviation.
+likelyAbbrev :: String -> [String]
+likelyAbbrev x =
+ let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
+ "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
+ "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
+ "Rev.", "Ph.D.", "M.D.", "M.A." ]
+ abbrPairs = map (break (=='.')) abbrevs
+ in map snd $ filter (\(y,_) -> y == x) abbrPairs
-- an endline character that can be treated as a space, not a structural break
endline :: GenParser Char ParserState Inline