{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {- Copyright (C) 2012 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.MediaWiki Copyright : Copyright (C) 2012 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of mediawiki text to 'Pandoc' document. -} {- TODO: _ support images http://www.mediawiki.org/wiki/Help:Images _ support tables http://www.mediawiki.org/wiki/Help:Tables - footnotes? -} module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) import Text.Pandoc.Options import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag, isCommentTag ) import Text.Pandoc.XML ( fromEntities ) import Text.Pandoc.Parsing hiding ( nested ) import Text.Pandoc.Generic ( bottomUp ) import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead ) import Data.Monoid (mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad import Data.List (intersperse, intercalate ) import Text.HTML.TagSoup import Data.Sequence (viewl, ViewL(..), (<|)) -- | Read mediawiki from an input string and return a Pandoc document. readMediaWiki :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Pandoc readMediaWiki opts s = case runParser parseMediaWiki MWState{ mwOptions = opts , mwMaxNestingLevel = 4 , mwNextLinkNumber = 1 } "source" (s ++ "\n") of Left err' -> error $ "\nError:\n" ++ show err' Right result -> result data MWState = MWState { mwOptions :: ReaderOptions , mwMaxNestingLevel :: Int , mwNextLinkNumber :: Int } type MWParser = Parser [Char] MWState -- -- auxiliary functions -- -- This is used to prevent exponential blowups for things like: -- ''a'''a''a'''a''a'''a''a'''a nested :: MWParser a -> MWParser a nested p = do nestlevel <- mwMaxNestingLevel `fmap` getState guard $ nestlevel > 0 updateState $ \st -> st{ mwMaxNestingLevel = mwMaxNestingLevel st - 1 } res <- p updateState $ \st -> st{ mwMaxNestingLevel = nestlevel } return res specialChars :: [Char] specialChars = "'[]<=&*{}|" spaceChars :: [Char] spaceChars = " \n\t" sym :: String -> MWParser () sym s = () <$ try (string s) newBlockTags :: [String] newBlockTags = ["haskell","syntaxhighlight","gallery"] isBlockTag' :: Tag String -> Bool isBlockTag' tag@(TagOpen t _) = isBlockTag tag || t `elem` newBlockTags isBlockTag' tag@(TagClose t) = isBlockTag tag || t `elem` newBlockTags isBlockTag' tag = isBlockTag tag htmlComment :: MWParser () htmlComment = () <$ htmlTag isCommentTag inlinesInTags :: String -> MWParser Inlines inlinesInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) if '/' `elem` raw -- self-closing tag then return mempty else trimInlines . mconcat <$> manyTill inline (htmlTag (~== TagClose tag)) blocksInTags :: String -> MWParser Blocks blocksInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) if '/' `elem` raw -- self-closing tag then return mempty else mconcat <$> manyTill block (htmlTag (~== TagClose tag)) charsInTags :: String -> MWParser [Char] charsInTags tag = try $ do (_,raw) <- htmlTag (~== TagOpen tag []) if '/' `elem` raw -- self-closing tag then return "" else innerText . parseTags <$> manyTill anyChar (htmlTag (~== TagClose tag)) -- -- main parser -- parseMediaWiki :: MWParser Pandoc parseMediaWiki = do bs <- mconcat <$> many block spaces eof return $ B.doc bs -- -- block parsers -- block :: MWParser Blocks block = mempty <$ skipMany1 blankline <|> header <|> hrule <|> orderedList <|> bulletList <|> definitionList <|> mempty <$ try (spaces *> htmlComment) <|> preformatted <|> blockTag <|> template <|> para para :: MWParser Blocks para = B.para . trimInlines . mconcat <$> many1 inline template :: MWParser Blocks template = B.rawBlock "mediawiki" <$> doublebrackets where doublebrackets = try $ do string "{{" notFollowedBy (char '{') contents <- manyTill anyChar (try $ string "}}") return $ "{{" ++ contents ++ "}}" blockTag :: MWParser Blocks blockTag = do (tag, _) <- lookAhead $ htmlTag isBlockTag' case tag of TagOpen "blockquote" _ -> B.blockQuote <$> blocksInTags "blockquote" TagOpen "pre" _ -> B.codeBlock . trimCode <$> charsInTags "pre" TagOpen "syntaxhighlight" attrs -> syntaxhighlight attrs TagOpen "haskell" _ -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$> charsInTags "haskell" TagOpen "gallery" _ -> blocksInTags "gallery" TagOpen "p" _ -> mempty <$ htmlTag (~== tag) TagClose "p" -> mempty <$ htmlTag (~== tag) _ -> B.rawBlock "html" . snd <$> htmlTag (~== tag) trimCode :: String -> String trimCode ('\n':xs) = stripTrailingNewlines xs trimCode xs = stripTrailingNewlines xs syntaxhighlight :: [Attribute String] -> MWParser Blocks syntaxhighlight attrs = try $ do let mblang = lookup "lang" attrs let mbstart = lookup "start" attrs let mbline = lookup "line" attrs let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart contents <- charsInTags "syntaxhighlight" return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents hrule :: MWParser Blocks hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline) preformatted :: MWParser Blocks preformatted = try $ do getPosition >>= \pos -> guard (sourceColumn pos == 1) char ' ' let endline' = B.linebreak <$ (try $ newline <* char ' ') let whitespace' = B.str <$> many1 ('\160' <$ spaceChar) let spToNbsp ' ' = '\160' spToNbsp x = x let nowiki' = mconcat . intersperse B.linebreak . map B.str . lines . fromEntities . map spToNbsp <$> try (htmlTag (~== TagOpen "nowiki" []) *> manyTill anyChar (htmlTag (~== TagClose "nowiki"))) let inline' = whitespace' <|> endline' <|> nowiki' <|> inline let strToCode (Str s) = Code ("",[],[]) s strToCode x = x B.para . bottomUp strToCode . mconcat <$> many1 inline' header :: MWParser Blocks header = try $ do col <- sourceColumn <$> getPosition guard $ col == 1 -- header must be at beginning of line eqs <- many1 (char '=') let lev = length eqs guard $ lev <= 6 contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') return $ B.header lev contents bulletList :: MWParser Blocks bulletList = B.bulletList <$> ( many1 (listItem '*') <|> (htmlTag (~== TagOpen "ul" []) *> spaces *> many (listItem '*' <|> li) <* optional (htmlTag (~== TagClose "ul"))) ) orderedList :: MWParser Blocks orderedList = (B.orderedList <$> many1 (listItem '#')) <|> (B.orderedList <$> (htmlTag (~== TagOpen "ul" []) *> spaces *> many (listItem '#' <|> li) <* optional (htmlTag (~== TagClose "ul")))) <|> do (tag,_) <- htmlTag (~== TagOpen "ol" []) spaces items <- many (listItem '#' <|> li) optional (htmlTag (~== TagClose "ol")) let start = maybe 1 id $ safeRead $ fromAttrib "start" tag return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items definitionList :: MWParser Blocks definitionList = B.definitionList <$> many1 defListItem defListItem :: MWParser (Inlines, [Blocks]) defListItem = try $ do terms <- mconcat . intersperse B.linebreak <$> many defListTerm defs <- many1 $ listItem ':' return (terms, defs) defListTerm :: MWParser Inlines defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>= parseFromString (trimInlines . mconcat <$> many inline) listStart :: Char -> MWParser () listStart c = char c *> notFollowedBy listStartChar listStartChar :: MWParser Char listStartChar = oneOf "*#;:" anyListStart :: MWParser Char anyListStart = char '*' <|> char '#' <|> char ':' <|> char ';' li :: MWParser Blocks li = lookAhead (htmlTag (~== TagOpen "li" [])) *> (firstParaToPlain <$> blocksInTags "li") <* spaces listItem :: Char -> MWParser Blocks listItem c = try $ do extras <- many (try $ char c <* lookAhead listStartChar) if null extras then listItem' c else do skipMany spaceChar first <- manyTill anyChar newline rest <- many (try $ string extras *> manyTill anyChar newline) contents <- parseFromString (many1 $ listItem' c) (unlines (first : rest)) case c of '*' -> return $ B.bulletList contents '#' -> return $ B.orderedList contents ':' -> return $ B.definitionList [(mempty, contents)] _ -> mzero listItem' :: Char -> MWParser Blocks listItem' c = try $ do listStart c skipMany spaceChar first <- manyTill anyChar newline rest <- many (try $ char c *> lookAhead listStartChar *> manyTill anyChar newline) parseFromString (firstParaToPlain . mconcat <$> many1 block) $ unlines $ first : rest firstParaToPlain :: Blocks -> Blocks firstParaToPlain contents = case viewl (B.unMany contents) of (Para xs) :< ys -> B.Many $ (Plain xs) <| ys _ -> contents -- -- inline parsers -- inline :: MWParser Inlines inline = whitespace <|> url <|> str <|> strong <|> emph <|> image <|> internalLink <|> externalLink <|> inlineTag <|> B.singleton <$> charRef <|> inlineHtml <|> variable <|> special str :: MWParser Inlines str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) variable :: MWParser Inlines variable = B.rawInline "mediawiki" <$> triplebrackets where triplebrackets = try $ do string "{{{" contents <- manyTill anyChar (try $ string "}}}") return $ "{{{" ++ contents ++ "}}}" inlineTag :: MWParser Inlines inlineTag = do (tag, _) <- lookAhead $ htmlTag isInlineTag case tag of TagOpen "nowiki" _ -> try $ do (_,raw) <- htmlTag (~== tag) if '/' `elem` raw then return mempty else B.text . fromEntities <$> manyTill anyChar (htmlTag (~== TagClose "nowiki")) TagOpen "br" _ -> B.linebreak <$ (htmlTag (~== TagOpen "br" []) -- will get /> too *> optional blankline) TagOpen "strike" _ -> B.strikeout <$> inlinesInTags "strike" TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del" TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub" TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup" TagOpen "math" _ -> B.math <$> charsInTags "math" TagOpen "code" _ -> B.code <$> charsInTags "code" TagOpen "tt" _ -> B.code <$> charsInTags "tt" TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" _ -> B.rawInline "html" . snd <$> htmlTag (~== tag) special :: MWParser Inlines special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *> oneOf specialChars) inlineHtml :: MWParser Inlines inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag whitespace :: MWParser Inlines whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment) endline :: MWParser () endline = () <$ try (newline <* notFollowedBy blankline <* notFollowedBy' hrule <* notFollowedBy anyListStart) image :: MWParser Inlines image = try $ do sym "[[" sym "File:" fname <- many1 (noneOf "|]") _ <- many (try $ char '|' *> imageOption) caption <- (B.str fname <$ sym "]]") <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]"))) return $ B.image fname "image" caption imageOption :: MWParser String imageOption = try (oneOfStrings [ "border", "thumbnail", "frameless" , "thumb", "upright", "left", "right" , "center", "none", "baseline", "sub" , "super", "top", "text-top", "middle" , "bottom", "text-bottom" ]) <|> try (string "frame") <|> try (many1 (oneOf "x0123456789") <* string "px") <|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]")) internalLink :: MWParser Inlines internalLink = try $ do sym "[[" let addUnderscores x = let (pref,suff) = break (=='#') x in pref ++ intercalate "_" (words suff) pagename <- unwords . words <$> many (noneOf "|]") label <- option (B.text pagename) $ char '|' *> ( (mconcat <$> many1 (notFollowedBy (char ']') *> inline)) -- the "pipe trick" -- [[Help:Contents|] -> "Contents" <|> (return $ B.text $ drop 1 $ dropWhile (/=':') pagename) ) sym "]]" linktrail <- B.text <$> many (char '\'' <|> letter) return $ B.link (addUnderscores pagename) "wikilink" (label <> linktrail) externalLink :: MWParser Inlines externalLink = try $ do char '[' (_, src) <- uri lab <- try (trimInlines . mconcat <$> (skipMany1 spaceChar *> manyTill inline (char ']'))) <|> do char ']' num <- mwNextLinkNumber <$> getState updateState $ \st -> st{ mwNextLinkNumber = num + 1 } return $ B.str $ show num return $ B.link src "" lab url :: MWParser Inlines url = do (orig, src) <- uri return $ B.link src "" (B.str orig) -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines inlinesBetween start end = (trimInlines . mconcat) <$> try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) innerSpace = try $ whitespace >>~ notFollowedBy' end emph :: MWParser Inlines emph = B.emph <$> nested (inlinesBetween start end) where start = sym "''" >> lookAhead nonspaceChar end = try $ notFollowedBy' (() <$ strong) >> sym "''" strong :: MWParser Inlines strong = B.strong <$> nested (inlinesBetween start end) where start = sym "'''" >> lookAhead nonspaceChar end = try $ sym "'''"