From e3abc2595f97962fd5158a1a7670309200cd1a28 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 13 Sep 2012 11:18:59 -0700 Subject: MediaWiki reader: Improved efficiency with raw html tags. Parse one tag, then use a case statement. --- src/Text/Pandoc/Readers/MediaWiki.hs | 137 +++++++++++++++-------------------- 1 file changed, 59 insertions(+), 78 deletions(-) (limited to 'src/Text/Pandoc/Readers/MediaWiki.hs') diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs index 4467d5499..5e742470c 100644 --- a/src/Text/Pandoc/Readers/MediaWiki.hs +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -30,8 +30,6 @@ Conversion of mediawiki text to 'Pandoc' document. -} {- TODO: -_ make tag parsers more efficient by parsing one HTML tag, checking it, - then... _ support HTML lists _ support list style attributes and start values in ol lists, also value attribute on li @@ -85,23 +83,27 @@ spaceChars = " \n\t" sym :: String -> MWParser () sym s = () <$ try (string s) +isBlockTag' :: Tag String -> Bool +isBlockTag' tag@(TagOpen t _) = isBlockTag tag || + t == "haskell" || t == "syntaxhighlight" +isBlockTag' tag@(TagClose t) = isBlockTag tag || + t == "haskell" || t == "syntaxhighlight" +isBlockTag' tag = isBlockTag tag + htmlComment :: MWParser () htmlComment = () <$ htmlTag isCommentTag inlinesInTags :: String -> MWParser Inlines inlinesInTags tag = trimInlines . mconcat <$> try - (htmlTag (~== TagOpen tag []) *> - manyTill inline (htmlTag (~== TagClose tag))) + (manyTill inline (htmlTag (~== TagClose tag))) blocksInTags :: String -> MWParser Blocks blocksInTags tag = mconcat <$> try - (htmlTag (~== TagOpen tag []) *> - manyTill block (htmlTag (~== TagClose tag))) + (manyTill block (htmlTag (~== TagClose tag))) charsInTags :: String -> MWParser [Char] charsInTags tag = innerText . parseTags <$> try - (htmlTag (~== TagOpen tag []) *> - manyTill anyChar (htmlTag (~== TagClose tag))) + (manyTill anyChar (htmlTag (~== TagClose tag))) -- -- main parser @@ -119,18 +121,15 @@ parseMediaWiki = do -- block :: MWParser Blocks -block = header +block = mempty <$ skipMany1 blankline + <|> header <|> hrule <|> bulletList <|> orderedList <|> definitionList <|> mempty <$ try (spaces *> htmlComment) <|> preformatted - <|> blockquote - <|> codeblock - <|> syntaxhighlight - <|> haskell - <|> mempty <$ skipMany1 blankline + <|> blockTag <|> pTag <|> blockHtml <|> para @@ -138,6 +137,32 @@ block = header para :: MWParser Blocks para = B.para . trimInlines . mconcat <$> many1 inline +blockTag :: MWParser Blocks +blockTag = do + (TagOpen t attrs, raw) <- htmlTag (\x -> isBlockTag' x && isTagOpen x) + case t of + "blockquote" -> B.blockQuote <$> blocksInTags "blockquote" + "pre" -> B.codeBlock . trimCode <$> charsInTags "pre" + "syntaxhighlight" -> syntaxhighlight attrs + "haskell" -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$> + charsInTags "haskell" + "p" -> return mempty + _ -> return $ B.rawBlock "html" raw + +trimCode :: String -> String +trimCode ('\n':xs) = stripTrailingNewlines xs +trimCode xs = stripTrailingNewlines xs + +syntaxhighlight :: [Attribute String] -> MWParser Blocks +syntaxhighlight attrs = try $ do + let mblang = lookup "lang" attrs + let mbstart = lookup "start" attrs + let mbline = lookup "line" attrs + let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline + let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart + contents <- charsInTags "syntaxhighlight" + return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents + -- We can just skip pTags, as contents will be treated as paragraphs pTag :: MWParser Blocks pTag = mempty <$ (htmlTag (\t -> t ~== TagOpen "p" [] || t ~== TagClose "p")) @@ -164,32 +189,6 @@ preformatted = do strToCode x = x B.para . bottomUp strToCode . mconcat <$> many1 inline' -blockquote :: MWParser Blocks -blockquote = B.blockQuote <$> blocksInTags "blockquote" - -codeblock :: MWParser Blocks -codeblock = B.codeBlock . trimCode <$> charsInTags "pre" - -trimCode :: String -> String -trimCode ('\n':xs) = stripTrailingNewlines xs -trimCode xs = stripTrailingNewlines xs - -syntaxhighlight :: MWParser Blocks -syntaxhighlight = try $ do - (TagOpen _ attrs, _) <- lookAhead - $ htmlTag (~== TagOpen "syntaxhighlight" []) - let mblang = lookup "lang" attrs - let mbstart = lookup "start" attrs - let mbline = lookup "line" attrs - let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline - let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart - contents <- charsInTags "syntaxhighlight" - return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents - -haskell :: MWParser Blocks -haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$> - charsInTags "haskell" - header :: MWParser Blocks header = try $ do col <- sourceColumn <$> getPosition @@ -271,15 +270,8 @@ inline = whitespace <|> str <|> strong <|> emph - <|> nowiki - <|> linebreak <|> externalLink - <|> strikeout - <|> subscript - <|> superscript - <|> math - <|> code - <|> hask + <|> inlineTag <|> B.singleton <$> charRef <|> inlineHtml <|> special @@ -287,8 +279,26 @@ inline = whitespace str :: MWParser Inlines str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) +inlineTag :: MWParser Inlines +inlineTag = do + (TagOpen t _, raw) <- htmlTag (\x -> isInlineTag x && isTagOpen x) + case t of + "nowiki" -> B.text . fromEntities <$> try + (manyTill anyChar (htmlTag (~== TagClose "nowiki"))) + "br" -> B.linebreak <$ + (optional (htmlTag (~== TagClose "br")) *> optional blankline) + "strike" -> B.strikeout <$> inlinesInTags "strike" + "del" -> B.strikeout <$> inlinesInTags "del" + "sub" -> B.subscript <$> inlinesInTags "sub" + "sup" -> B.superscript <$> inlinesInTags "sup" + "math" -> B.math <$> charsInTags "math" + "code" -> B.code <$> charsInTags "code" + "tt" -> B.code <$> charsInTags "tt" + "hask" -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" + _ -> return $ B.rawInline "html" raw + special :: MWParser Inlines -special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *> +special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *> oneOf specialChars) inlineHtml :: MWParser Inlines @@ -303,12 +313,6 @@ endline = () <$ try (newline <* notFollowedBy' hrule <* notFollowedBy anyListStart) -linebreak :: MWParser Inlines -linebreak = B.linebreak <$ - (htmlTag (~== TagOpen "br" []) *> - optional (htmlTag (~== TagClose "br")) *> - optional blankline) - externalLink :: MWParser Inlines externalLink = try $ do char '[' @@ -325,29 +329,6 @@ url = do (orig, src) <- uri return $ B.link src "" (B.str orig) -nowiki :: MWParser Inlines -nowiki = B.text . fromEntities <$> try - (htmlTag (~== TagOpen "nowiki" []) *> - manyTill anyChar (htmlTag (~== TagClose "nowiki"))) - -strikeout :: MWParser Inlines -strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del") - -superscript :: MWParser Inlines -superscript = B.superscript <$> inlinesInTags "sup" - -subscript :: MWParser Inlines -subscript = B.subscript <$> inlinesInTags "sub" - -math :: MWParser Inlines -math = B.math <$> charsInTags "math" - -code :: MWParser Inlines -code = B.code <$> (charsInTags "code" <|> charsInTags "tt") - -hask :: MWParser Inlines -hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" - -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines inlinesBetween start end = -- cgit v1.2.3