From 896803b0d5d1f5d680d125eb75913025fa734190 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 23 Oct 2017 17:29:32 -0700 Subject: HTML reader: `htmlTag` improvements. We previously failed on cases where an attribute contained a `>` character. This patch fixes the bug. Closes #3989. --- src/Text/Pandoc/Readers/HTML.hs | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 4cbc03089..a545f3f3d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1125,9 +1125,15 @@ htmlTag :: (HasReaderOptions st, Monad m) htmlTag f = try $ do lookAhead (char '<') inp <- getInput - let (next : _) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = False } inp - guard $ f next + let ts = canonicalizeTags $ parseTagsOptions + parseOptions{ optTagWarning = False + , optTagPosition = True } + (inp ++ " ") -- add space to ensure that + -- we get a TagPosition after the tag + (next, ln, col) <- case ts of + (TagPosition{} : next : TagPosition ln col : _) + | f next -> return (next, ln, col) + _ -> mzero -- -- should NOT be parsed as an HTML tag, see #2277, @@ -1138,6 +1144,11 @@ htmlTag f = try $ do [] -> False (c:cs) -> isLetter c && all isNameChar cs + let endAngle = try $ do char '>' + pos <- getPosition + guard $ (sourceLine pos == ln && + sourceColumn pos >= col) || + sourceLine pos > ln let handleTag tagname = do -- basic sanity check, since the parser is very forgiving -- and finds tags in stuff like x should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] guard $ last tagname /= ':' - rendered <- manyTill anyChar (char '>') - return (next, rendered <> ">") + char '<' + rendered <- manyTill anyChar endAngle + return (next, "<" ++ rendered ++ ">") case next of TagComment s | "