aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2017-10-23 17:29:32 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2017-10-23 17:29:32 -0700
commit896803b0d5d1f5d680d125eb75913025fa734190 (patch)
tree18c316bc936de98eaf75323609fadf1b4165c0cf /src/Text/Pandoc/Readers/HTML.hs
parent1a82ecbb6866a00689e3220d304a0fafd81358bb (diff)
downloadpandoc-896803b0d5d1f5d680d125eb75913025fa734190.tar.gz
HTML reader: `htmlTag` improvements.
We previously failed on cases where an attribute contained a `>` character. This patch fixes the bug. Closes #3989.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs27
1 files changed, 19 insertions, 8 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 4cbc03089..a545f3f3d 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1125,9 +1125,15 @@ htmlTag :: (HasReaderOptions st, Monad m)
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let (next : _) = canonicalizeTags $ parseTagsOptions
- parseOptions{ optTagWarning = False } inp
- guard $ f next
+ let ts = canonicalizeTags $ parseTagsOptions
+ parseOptions{ optTagWarning = False
+ , optTagPosition = True }
+ (inp ++ " ") -- add space to ensure that
+ -- we get a TagPosition after the tag
+ (next, ln, col) <- case ts of
+ (TagPosition{} : next : TagPosition ln col : _)
+ | f next -> return (next, ln, col)
+ _ -> mzero
-- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
-- should NOT be parsed as an HTML tag, see #2277,
@@ -1138,6 +1144,11 @@ htmlTag f = try $ do
[] -> False
(c:cs) -> isLetter c && all isNameChar cs
+ let endAngle = try $ do char '>'
+ pos <- getPosition
+ guard $ (sourceLine pos == ln &&
+ sourceColumn pos >= col) ||
+ sourceLine pos > ln
let handleTag tagname = do
-- basic sanity check, since the parser is very forgiving
-- and finds tags in stuff like x<y)
@@ -1146,14 +1157,14 @@ htmlTag f = try $ do
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
guard $ last tagname /= ':'
- rendered <- manyTill anyChar (char '>')
- return (next, rendered <> ">")
+ char '<'
+ rendered <- manyTill anyChar endAngle
+ return (next, "<" ++ rendered ++ ">")
case next of
TagComment s
| "<!--" `isPrefixOf` inp -> do
- count (length s + 4) anyChar
- skipMany (satisfy (/='>'))
- char '>'
+ char '<'
+ manyTill anyChar endAngle
stripComments <- getOption readerStripComments
if stripComments
then return (next, "")