diff options
author | John MacFarlane <fiddlosopher@gmail.com> | 2011-12-29 23:44:12 -0800 |
---|---|---|
committer | John MacFarlane <fiddlosopher@gmail.com> | 2011-12-29 23:44:12 -0800 |
commit | 925a4c5164026bfda25bf50b552bacec074fdf3f (patch) | |
tree | 2ddf5a2cd3530789434697d19c0bf8fd892d0f9f /src/Text/Pandoc | |
parent | 600c22e7bf57b219467794cd8e37c19571988462 (diff) | |
download | pandoc-925a4c5164026bfda25bf50b552bacec074fdf3f.tar.gz |
Better smart quote parsing.
* Added stateLastStrPos to ParserState. This lets us keep track
of whether the parser is at the position immediately after a 'str'.
If we encounter a ' at such a position, it must be an apostrophe,
and can't be the start of a single-quoted span.
* Set stateLastStrPos in the Markdown, Textile, HTML, and RST str parsers.
* Closes #360.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 8 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 8 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Textile.hs | 2 |
5 files changed, 22 insertions, 4 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 5fa375ca6..c2c512033 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -603,6 +603,7 @@ data ParserState = ParserState { stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX? stateParserContext :: ParserContext, -- ^ Inside list? stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed stateKeys :: KeyTable, -- ^ List of reference keys stateCitations :: [String], -- ^ List of available citations stateNotes :: NoteTable, -- ^ List of notes @@ -630,6 +631,7 @@ defaultParserState = ParserState { stateParseRaw = False, stateParserContext = NullState, stateQuoteContext = NoQuote, + stateLastStrPos = Nothing, stateKeys = M.empty, stateCitations = [], stateNotes = [], @@ -751,8 +753,12 @@ charOrRef cs = return c) singleQuoteStart :: GenParser Char ParserState () -singleQuoteStart = do +singleQuoteStart = do failIfInQuoteContext InSingleQuote + pos <- getPosition + st <- getState + -- single quote start can't be right after str + guard $ stateLastStrPos st /= Just pos try $ do charOrRef "'\8216\145" notFollowedBy (oneOf ")!],;:-? \t\n") notFollowedBy (char '.') <|> lookAhead (string "..." 
>> return ()) diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 70ea24680..179065413 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -421,8 +421,12 @@ pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad pStr :: GenParser Char ParserState Inline -pStr = liftM Str $ many1 $ satisfy $ \c -> - not (isSpace c) && not (isSpecial c) && not (isBad c) +pStr = do + result <- many1 $ satisfy $ \c -> + not (isSpace c) && not (isSpecial c) && not (isBad c) + pos <- getPosition + updateState $ \s -> s{ stateLastStrPos = Just pos } + return $ Str result isSpecial :: Char -> Bool isSpecial '"' = True diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index db68df629..d854bd3c7 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1096,6 +1096,8 @@ str = do lookAhead alphaNum >> return '\x2019') -- for things like l'aide else mzero + pos <- getPosition + updateState $ \s -> s{ stateLastStrPos = Just pos } let result = a:as let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) if smart diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 3dcfe47d0..d8704d8c9 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -791,7 +791,11 @@ whitespace :: GenParser Char ParserState Inline whitespace = many1 spaceChar >> return Space <?> "whitespace" str :: GenParser Char ParserState Inline -str = many1 (noneOf (specialChars ++ "\t\n ")) >>= return . 
Str +str = do + result <- many1 (noneOf (specialChars ++ "\t\n ")) + pos <- getPosition + updateState $ \s -> s{ stateLastStrPos = Just pos } + return $ Str result -- an endline character that can be treated as a space, not a structural break endline :: GenParser Char ParserState Inline diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 12d299aa4..4693bd06d 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -436,6 +436,8 @@ str = do next <- lookAhead letter guard $ isLetter (last xs) || isLetter next return $ xs ++ "-" + pos <- getPosition + updateState $ \s -> s{ stateLastStrPos = Just pos } return $ Str result -- | Textile allows HTML span infos, we discard them |