Markdown reader: Improved emph/strong parsing.

Ported code from pandoc2. Now all tests pass.
author: John MacFarlane <jgm@berkeley.edu> 2011-07-30 18:08:49 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2011-07-30 18:08:49 -0700
commit: b66b7a791ca87d6afccb6e44cdadca158ced5d4c (patch)
tree: 94489773cd98b5f0c6acb350ebf79ae63a30129b /src/Text/Pandoc/Readers/Markdown.hs
parent: 2d14c9b4363b827604bead5c3a378630087d8a9d (diff)
download: pandoc-b66b7a791ca87d6afccb6e44cdadca158ced5d4c.tar.gz
1 files changed, 34 insertions, 13 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 6f51dfd9a..26721cf5c 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -44,6 +44,7 @@ import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockT
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
 import Control.Monad (when, liftM, guard)
+import Control.Applicative ((<$>), (*>), (<*))
 import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match (tagOpen)
 
@@ -907,7 +908,7 @@ inlineParsers = [ whitespace
                 , str
                 , endline
                 , code
-                , (fourOrMore '*' <|> fourOrMore '_')
+                , fours
                 , strong
                 , emph
                 , note
@@ -1018,24 +1019,44 @@ mathInline = try $ do
 
 -- to avoid performance problems, treat 4 or more _ or * in a row as a literal
 -- rather than attempting to parse for emph/strong
-fourOrMore :: Char -> GenParser Char st Inline
-fourOrMore c = try $ count 4 (char c) >> many (char c) >>= \s ->
-                       return (Str $ replicate 4 c ++ s)
+fours :: GenParser Char st Inline
+fours = try $ do
+  x <- char '*' <|> char '_'
+  count 2 $ satisfy (==x)
+  rest <- many1 (satisfy (==x))
+  return $ Str (x:x:x:rest)
+
+-- | Parses a list of inlines between start and end delimiters.
+inlinesBetween :: (Show b)
+               => GenParser Char ParserState a
+               -> GenParser Char ParserState b
+               -> GenParser Char ParserState [Inline]
+inlinesBetween start end =
+  normalizeSpaces <$> try (start *> many1Till inner end)
+    where inner      = innerSpace <|> (notFollowedBy' whitespace *> inline)
+          innerSpace = try $ whitespace <* notFollowedBy' end
 
 emph :: GenParser Char ParserState Inline
-emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
-        (enclosed (char '_') (notFollowedBy' strong >> char '_' >> 
-                              notFollowedBy alphaNum) inline)) >>= 
-        return . Emph . normalizeSpaces
+emph = Emph <$>
+  (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd)
+    where starStart = char '*' *> lookAhead nonspaceChar
+          starEnd   = notFollowedBy' strong *> char '*'
+          ulStart   = char '_' *> lookAhead nonspaceChar
+          ulEnd     = notFollowedBy' strong *> char '_'
 
 strong :: GenParser Char ParserState Inline
-strong = ((enclosed (string "**") (try $ string "**") inline) <|> 
-          (enclosed (string "__") (try $ string "__") inline)) >>=
-         return . Strong . normalizeSpaces
+strong = Strong <$>
+  (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd)
+    where starStart = string "**" *> lookAhead nonspaceChar
+          starEnd   = try $ string "**"
+          ulStart   = string "__" *> lookAhead nonspaceChar
+          ulEnd     = try $ string "__"
 
 strikeout :: GenParser Char ParserState Inline
-strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
-            return . Strikeout . normalizeSpaces
+strikeout = Strikeout <$> (failIfStrict >> inlinesBetween strikeStart strikeEnd)
+  where strikeStart = string "~~" *> lookAhead nonspaceChar
+                      *> notFollowedBy (char '~')
+        strikeEnd   = try $ string "~~"
 
 superscript :: GenParser Char ParserState Inline
 superscript = failIfStrict >> enclosed (char '^') (char '^')
author	John MacFarlane <jgm@berkeley.edu>	2011-07-30 18:08:49 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2011-07-30 18:08:49 -0700
commit	b66b7a791ca87d6afccb6e44cdadca158ced5d4c (patch)
tree	94489773cd98b5f0c6acb350ebf79ae63a30129b /src/Text/Pandoc/Readers/Markdown.hs
parent	2d14c9b4363b827604bead5c3a378630087d8a9d (diff)
download	pandoc-b66b7a791ca87d6afccb6e44cdadca158ced5d4c.tar.gz