aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-06-21 22:35:07 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2021-06-21 22:35:07 -0700
commit0352f7845bfa2053797850c3639414978285b63e (patch)
treee25772594c18c114debc2ebf3d94160855498192 /src
parent2ef2049b4e94dc51961e75edb27af1d2f83acd3b (diff)
downloadpandoc-0352f7845bfa2053797850c3639414978285b63e.tar.gz
Improve emailAddress in Text.Pandoc.Parsing.
Previously the parser would accept characters in domains that are illegal in domains, and this sometimes caused it to gobble bits of the following text. Closes #7398. Note that this change, by itself, caused some txt2tag reader tests to fail. txt2tags allows bare email addresses with a following form query. So, in addition to the change to emailAddress, we modify the txt2tags parser so it can still handle these cases.
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Parsing.hs7
-rw-r--r--src/Text/Pandoc/Readers/Txt2Tags.hs22
2 files changed, 24 insertions, 5 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 0bb794ba1..082d9565b 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -693,13 +693,12 @@ emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
mailbox = intercalate "." <$> (emailWord `sepBy1'` dot)
domain = intercalate "." <$> (subdomain `sepBy1'` dot)
dot = char '.'
- subdomain = many1 $ alphaNum <|> innerPunct
+ subdomain = many1 $ alphaNum <|> innerPunct (=='-')
-- this excludes some valid email addresses, since an
-- email could contain e.g. '__', but gives better results
-- for our purposes, when combined with markdown parsing:
- innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@')
- <* notFollowedBy space
- <* notFollowedBy (satisfy isPunctuation))
+ innerPunct f = try (satisfy f
+ <* notFollowedBy (satisfy (not . isAlphaNum)))
-- technically an email address could begin with a symbol,
-- but allowing this creates too many problems.
-- See e.g. https://github.com/jgm/pandoc/issues/2940
diff --git a/src/Text/Pandoc/Readers/Txt2Tags.hs b/src/Text/Pandoc/Readers/Txt2Tags.hs
index 6f92f0063..b5cf5a0f3 100644
--- a/src/Text/Pandoc/Readers/Txt2Tags.hs
+++ b/src/Text/Pandoc/Readers/Txt2Tags.hs
@@ -478,9 +478,29 @@ macro = try $ do
-- raw URLs in text are automatically linked
url :: T2T Inlines
url = try $ do
- (rawUrl, escapedUrl) <- try uri <|> emailAddress
+ (rawUrl, escapedUrl) <- try uri <|> emailAddress'
return $ B.link rawUrl "" (B.str escapedUrl)
+emailAddress' :: T2T (Text, Text)
+emailAddress' = do
+ (base, mailURI) <- emailAddress
+ query <- option "" emailQuery
+ return (base <> query, mailURI <> query)
+
+emailQuery :: T2T Text
+emailQuery = do
+ char '?'
+ parts <- kv `sepBy1` (char '&')
+ return $ "?" <> T.intercalate "&" parts
+
+kv :: T2T Text
+kv = do
+ k <- T.pack <$> many1 alphaNum
+ char '='
+ let vchar = alphaNum <|> try (oneOf "%._/~:,=$@&+-;*" <* lookAhead alphaNum)
+ v <- T.pack <$> many1 vchar
+ return (k <> "=" <> v)
+
uri :: T2T (Text, Text)
uri = try $ do
address <- t2tURI