From 773bbb8fc73a3b6598188dbae64a841eb6680b38 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 10 Apr 2016 07:39:36 -0700 Subject: Markdown + HTML readers: be more forgiving about unescaped &. We are now more forgiving about parsing invalid HTML with unescaped `&` as raw HTML. (Previously any unescaped `&` would cause pandoc not to recognize the string as raw HTML.) Closes #2410. --- src/Text/Pandoc/Readers/HTML.hs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fb936cff7..8ee5da543 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -971,11 +971,20 @@ htmlTag :: Monad m htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let (next : rest) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = True } inp + let (next : _) = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False } inp guard $ f next + let handleTag tagname = do + -- + -- should NOT be parsed as an HTML tag, see #2277 + guard $ not ('.' `elem` tagname) + -- should NOT be a tag either. + -- tagsoup will parse it as TagOpen "https:" [("example.org","")] + guard $ not (null tagname) + guard $ last tagname /= ':' + rendered <- manyTill anyChar (char '>') + return (next, rendered ++ ">") case next of - TagWarning _ -> fail "encountered TagWarning" TagComment s | "") | otherwise -> fail "bogus comment mode, HTML5 parse error" - _ -> do - -- we get a TagWarning on things like - -- - -- which should NOT be parsed as an HTML tag, see #2277 - guard $ not $ hasTagWarning rest - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + TagOpen tagname _attr -> handleTag tagname + TagClose tagname -> handleTag tagname + _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) -- cgit v1.2.3