aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2016-02-20 14:54:19 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2016-02-20 15:00:31 -0800
commit1534052dd9c0336da149b79440c7d46714592e31 (patch)
tree6f6cba34199223e1844df3fff3ec16f670d53b95 /src/Text/Pandoc
parentd45fcf9f6dcc519a1a9e94502aa69cbc592cc5e6 (diff)
downloadpandoc-1534052dd9c0336da149b79440c7d46714592e31.tar.gz
HTML reader: rewrote htmlInBalanced.
This version avoids an exponential performance problem with `<script>` tags, and it should be faster in general. Closes #2730.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs49
1 files changed, 39 insertions, 10 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 23add159e..03b790d0b 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -930,14 +930,45 @@ htmlInBalanced :: (Monad m)
=> (Tag String -> Bool)
-> ParserT String st m String
htmlInBalanced f = try $ do
- (TagOpen t _, tag) <- htmlTag f
- guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag
- let stopper = htmlTag (~== TagClose t)
- let anytag = snd <$> htmlTag (const True)
- contents <- many $ notFollowedBy' stopper >>
- (htmlInBalanced f <|> anytag <|> count 1 anyChar)
- endtag <- liftM snd stopper
- return $ tag ++ concat contents ++ endtag
+ lookAhead (char '<')
+ inp <- getInput
+ let ts = canonicalizeTags $
+ parseTagsOptions parseOptions{ optTagWarning = True,
+ optTagPosition = True } inp
+ case ts of
+ (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
+ guard $ f t
+ guard $ not $ hasTagWarning (t : take 1 rest)
+ case htmlInBalanced' tn (t:rest) of
+ [] -> mzero
+ xs -> case reverse xs of
+ (TagClose _ : TagPosition er ec : _) -> do
+ let ls = er - sr
+ let cs = ec - sc
+ lscontents <- concat <$> count ls anyLine
+ cscontents <- count cs anyChar
+ (_,closetag) <- htmlTag (~== TagClose tn)
+ return (lscontents ++ cscontents ++ closetag)
+ _ -> mzero
+ _ -> mzero
+
+htmlInBalanced' :: String
+ -> [Tag String]
+ -> [Tag String]
+htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
+ where go :: Int -> [Tag String] -> Maybe [Tag String]
+ go n (t@(TagOpen tn' _):rest) | tn' == tagname =
+ (t :) <$> go (n + 1) rest
+ go 1 (t@(TagClose tn'):_) | tn' == tagname =
+ return [t]
+ go n (t@(TagClose tn'):rest) | tn' == tagname =
+ (t :) <$> go (n - 1) rest
+ go n (t:ts') = (t :) <$> go n ts'
+ go n [] = mzero
+
+hasTagWarning :: [Tag String] -> Bool
+hasTagWarning (TagWarning _:_) = True
+hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: Monad m
@@ -946,8 +977,6 @@ htmlTag :: Monad m
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
- let hasTagWarning (TagWarning _:_) = True
- hasTagWarning _ = False
let (next : rest) = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = True } inp
guard $ f next