{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
{-
  Copyright (C) 2012 John MacFarlane

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Readers.MediaWiki
   Copyright   : Copyright (C) 2012 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane
   Stability   : alpha
   Portability : portable

Conversion of mediawiki text to 'Pandoc' document.
-}
{-
TODO:
_ tests for lists
_ support HTML lists
_ support list style attributes and start values in ol lists, also
  value attribute on li
_ support preformatted text (lines starting with space)
_ support preformatted text blocks
_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi
  <source lang="..."> (alternatively, <syntaxhighlight>)
  if 'line' attribute present, number lines
  if 'start' present, set starting line number
_ support internal links http://www.mediawiki.org/wiki/Help:Links
_ support external links
_ support automatic linkification of URLs
_ support images http://www.mediawiki.org/wiki/Help:Images
_ ignore gallery tag?
_ support tables http://www.mediawiki.org/wiki/Help:Tables
_ support <math> tag for latex math
_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
  postprocessing
_ category links
_ tests for raw html inline
_ tests for sup, sub, del
_ tests for pre, haskell
_ tests for code, tt, hask
_ test for blockquote
-}
module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where

import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
import Text.Pandoc.Options
import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
           isInlineTag, isBlockTag, isTextTag, isCommentTag )
import Text.Pandoc.XML ( fromEntities )
import Text.Pandoc.Parsing
import Text.Pandoc.Shared ( stripTrailingNewlines )
import Data.Monoid (mconcat, mempty)
import qualified Data.Foldable as F
import Control.Applicative ((<$>), (<*), (*>), (<$))
import Control.Monad
import Data.List (intersperse)
import Text.HTML.TagSoup
import Data.Sequence (viewl, ViewL(..), (<|))

-- | Parse a MediaWiki-formatted string into a 'Pandoc' document.
--
-- Two newlines are appended to the input before it is handed to
-- 'parseMediaWiki', and parsing starts from the default parser state
-- with 'stateOptions' replaced by the supplied options.
readMediaWiki :: ReaderOptions -- ^ Reader options
              -> String        -- ^ String to parse (assuming @'\n'@ line endings)
              -> Pandoc
readMediaWiki opts s = readWith parseMediaWiki initialState paddedInput
  where
    initialState = def{ stateOptions = opts }
    paddedInput  = s ++ "\n\n"

-- | Parser type used throughout this module: character input with
-- Pandoc's 'ParserState' as user state.
type MWParser = Parser String ParserState

--
-- auxiliary functions
--

-- | Characters that 'str' never consumes; 'special' emits them one at
-- a time when no other inline parser has claimed them.
specialChars :: String
specialChars = "'[]<=&*"

-- | Whitespace characters (space, newline, tab) that 'str' stops at.
spaceChars :: String
spaceChars = " \n\t"

-- | Match the literal string @s@ and discard it.  The match is wrapped
-- in 'try', so a partial match consumes no input.
sym :: String -> MWParser ()
sym s = try (string s) >> return ()

-- | Consume a single HTML comment tag and discard it.
htmlComment :: MWParser ()
htmlComment = htmlTag isCommentTag >> return ()

-- | Parse inline content enclosed by an opening and closing HTML tag
-- with the given name.  The collected inlines are concatenated and
-- trimmed; the whole attempt backtracks on failure.
inlinesInTags :: String -> MWParser Inlines
inlinesInTags tag = try $ do
  _ <- htmlTag (~== TagOpen tag [])
  ils <- manyTill inline (htmlTag (~== TagClose tag))
  return $ trimInlines (mconcat ils)

-- | Parse block content enclosed by an opening and closing HTML tag
-- with the given name, concatenating the parsed blocks.  The whole
-- attempt backtracks on failure.
blocksInTags :: String -> MWParser Blocks
blocksInTags tag = try $ do
  _ <- openTag
  blks <- manyTill block closeTag
  return $ mconcat blks
  where
    openTag  = htmlTag (~== TagOpen tag [])
    closeTag = htmlTag (~== TagClose tag)

-- | Collect the raw characters between an opening and closing HTML tag
-- with the given name, then run them through TagSoup ('parseTags'
-- followed by 'innerText') to obtain the text.  The whole attempt
-- backtracks on failure.
charsInTags :: String -> MWParser [Char]
charsInTags tag = try $ do
  _ <- htmlTag (~== TagOpen tag [])
  raw <- manyTill anyChar (htmlTag (~== TagClose tag))
  return $ innerText (parseTags raw)

--
-- main parser
--

-- | Top-level document parser: zero or more blocks, then optional
-- trailing whitespace, then end of input.  The blocks are concatenated
-- and wrapped with 'B.doc'.
parseMediaWiki :: MWParser Pandoc
parseMediaWiki = B.doc . mconcat <$> (many block <* spaces <* eof)

--
-- block parsers
--

-- | Parse one block-level element.  The alternatives are tried in the
-- order listed; 'para' is the final fallback.  Runs of blank lines and
-- HTML comments (optionally preceded by whitespace) produce no output.
block :: MWParser Blocks
block = choice
  [ header
  , hrule
  , bulletList
  , orderedList
  , definitionList
  , blockquote
  , codeblock
  , haskell
  , skipMany1 blankline >> return mempty
  , try (spaces >> htmlComment) >> return mempty
  , pTag
  , blockHtml
  , para
  ]

-- | A paragraph: one or more inlines, concatenated and trimmed.
para :: MWParser Blocks
para = do
  ils <- many1 inline
  return . B.para . trimInlines $ mconcat ils

-- | Consume an opening or closing @p@ tag and produce nothing.
-- The tags can simply be skipped, because their contents will be
-- treated as paragraphs anyway.
pTag :: MWParser Blocks
pTag = htmlTag isPTag >> return mempty
  where
    isPTag t = t ~== TagOpen "p" [] || t ~== TagClose "p"

-- | Pass a block-level HTML tag through as a raw @html@ block, using
-- the second component of the result of 'htmlTag'.
blockHtml :: MWParser Blocks
blockHtml = fmap (B.rawBlock "html" . snd) (htmlTag isBlockTag)

-- | A horizontal rule: four or more hyphens immediately followed by a
-- newline.  Backtracks if the line does not match.
hrule :: MWParser Blocks
hrule = try $ do
  _ <- string "----"
  skipMany (char '-')
  _ <- newline
  return B.horizontalRule

-- | Blocks wrapped in @blockquote@ tags become a block quote.
blockquote :: MWParser Blocks
blockquote = do
  quoted <- blocksInTags "blockquote"
  return $ B.blockQuote quoted

-- | Text wrapped in @pre@ tags becomes a code block, after 'trimCode'
-- has been applied to it.
codeblock :: MWParser Blocks
codeblock = do
  raw <- charsInTags "pre"
  return $ B.codeBlock (trimCode raw)

-- | Drop at most one leading newline, then strip trailing newlines.
trimCode :: String -> String
trimCode = stripTrailingNewlines . dropLeadingNewline
  where
    dropLeadingNewline ('\n':rest) = rest
    dropLeadingNewline other       = other

-- | Text wrapped in @haskell@ tags becomes a code block carrying the
-- class @haskell@, after 'trimCode' has been applied to it.
haskell :: MWParser Blocks
haskell = fmap (B.codeBlockWith attr . trimCode) (charsInTags "haskell")
  where
    attr = ("", ["haskell"], [])

-- | A header: a run of one to six @=@ characters starting in column 1,
-- inline content, and then the same number of @=@ characters.  The
-- header level is the length of the opening run.
header :: MWParser Blocks
header = try $ do
  pos <- getPosition
  guard (sourceColumn pos == 1)  -- header must be at beginning of line
  lev <- length <$> many1 (char '=')
  guard (lev <= 6)
  ils <- manyTill inline (count lev (char '='))
  return $ B.header lev (trimInlines (mconcat ils))

-- | A bullet list: one or more list items introduced by @*@.
bulletList :: MWParser Blocks
bulletList = do
  items <- many1 (listItem '*')
  return $ B.bulletList items

-- | An ordered list: one or more list items introduced by @#@.
orderedList :: MWParser Blocks
orderedList = do
  items <- many1 (listItem '#')
  return $ B.orderedList items

-- | A definition list: one or more items parsed by 'defListItem'.
definitionList :: MWParser Blocks
definitionList = do
  items <- many1 defListItem
  return $ B.definitionList items

-- | One definition-list entry: zero or more @;@ term lines (joined
-- with line breaks) followed by one or more @:@ definition items.
-- Backtracks if no definition follows.
defListItem :: MWParser (Inlines, [Blocks])
defListItem = try $ liftM2 (,) termPart defPart
  where
    termPart = fmap (mconcat . intersperse B.linebreak) (many defListTerm)
    defPart  = many1 (listItem ':')

-- | A definition term line: @;@, optional spaces, then the rest of the
-- line, which is re-parsed as inline content and trimmed.
defListTerm  :: MWParser Inlines
defListTerm = do
  _ <- char ';'
  skipMany spaceChar
  raw <- manyTill anyChar newline
  parseFromString (trimInlines . mconcat <$> many inline) raw

-- | Match the list marker @c@, provided it is not immediately followed
-- by another list-marker character.
listStart :: Char -> MWParser ()
listStart c = do
  _ <- char c
  notFollowedBy listStartChar

-- | Any single list-marker character: @*@, @#@, @;@ or @:@.
listStartChar :: MWParser Char
listStartChar = satisfy (`elem` "*#;:")

-- | Match any one of the list-marker characters, trying @*@, @#@, @:@
-- and @;@ in that order.
anyListStart :: MWParser Char
anyListStart = choice (map char "*#:;")

-- | Parse a list item introduced by marker @c@, handling nesting.
--
-- Leading occurrences of @c@ that are themselves followed by a list
-- marker are collected as a nesting prefix.  With no prefix, the item is
-- parsed directly by @listItem'@.  Otherwise the rest of the line, plus
-- every following line that starts with the same prefix (prefix
-- removed), is re-parsed as a sequence of @listItem'@ items and wrapped
-- in the list type that corresponds to @c@.
listItem :: Char -> MWParser Blocks
listItem c = try $ do
  extras <- many (try $ char c <* lookAhead listStartChar)
  case extras of
    [] -> listItem' c
    _  -> nestedItems extras
  where
    restOfLine = manyTill anyChar newline
    nestedItems prefix = do
      firstLine <- restOfLine
      moreLines <- many (try $ string prefix >> restOfLine)
      items <- parseFromString (many1 $ listItem' c)
                 (unlines (firstLine : moreLines))
      wrap items
    -- Only '*', '#' and ':' have a corresponding list type.
    wrap items
      | c == '*'  = return $ B.bulletList items
      | c == '#'  = return $ B.orderedList items
      | c == ':'  = return $ B.definitionList [(mempty, items)]
      | otherwise = mzero

-- | Parse a single, non-nested list item introduced by marker @c@.
--
-- The item body is the rest of the first line plus every following line
-- that starts with @c@ followed by another list marker (the leading @c@
-- is dropped, the following marker is kept).  These lines are re-parsed
-- as blocks; if the first resulting block is a 'Para' it is turned into
-- a 'Plain'.
listItem' :: Char -> MWParser Blocks
listItem' c = try $ do
  listStart c
  firstLine <- manyTill anyChar newline
  contLines <- many continuation
  blks <- parseFromString (mconcat <$> many1 block)
            (unlines (firstLine : contLines))
  return $ case viewl (B.unMany blks) of
             Para ils :< others -> B.Many (Plain ils <| others)
             _                  -> blks
  where
    continuation = try $ do
      _ <- char c
      _ <- lookAhead listStartChar
      manyTill anyChar newline

--
-- inline parsers
--

-- | Parse one inline element.  The alternatives are tried in the order
-- listed; 'special' is the final fallback.
inline :: MWParser Inlines
inline = choice
  [ whitespace
  , url
  , str
  , strong
  , emph
  , nowiki
  , linebreak
  , externalLink
  , strikeout
  , subscript
  , superscript
  , code
  , hask
  , fmap B.singleton charRef
  , inlineHtml
  , special
  ]

-- | A run of one or more characters that are neither special nor
-- whitespace.
str :: MWParser Inlines
str = fmap B.str (many1 plainChar)
  where
    plainChar = noneOf (specialChars ++ spaceChars)

-- | A single special character emitted as a one-character string,
-- provided the input at this point is not a block-level HTML tag.
special :: MWParser Inlines
special = do
  notFollowedBy' (htmlTag isBlockTag)
  c <- oneOf specialChars
  return $ B.str [c]

-- | Pass an inline HTML tag through as a raw @html@ inline, using the
-- second component of the result of 'htmlTag'.
inlineHtml :: MWParser Inlines
inlineHtml = fmap (B.rawInline "html" . snd) (htmlTag isInlineTag)

-- | A run of space characters, a soft line ending, or an HTML comment;
-- each yields a single space.
whitespace :: MWParser Inlines
whitespace = (skipMany1 spaceChar <|> endline <|> htmlComment) >> return B.space

-- | A newline that does not end the current paragraph: it must not be
-- followed by a blank line, a horizontal rule, or a list marker.
-- Backtracks (leaving the newline unconsumed) otherwise.
endline :: MWParser ()
endline = try $ do
  _ <- newline
  notFollowedBy blankline
  notFollowedBy' hrule
  notFollowedBy anyListStart

-- | A @br@ tag, optionally followed by a closing @br@ tag and by one
-- blank line, yields a hard line break.
linebreak :: MWParser Inlines
linebreak = do
  _ <- htmlTag (~== TagOpen "br" [])
  optional $ htmlTag (~== TagClose "br")
  optional blankline
  return B.linebreak

-- | A bracketed external link.
--
-- Two forms are accepted:
--
-- * @[uri label]@: the URI, at least one space character, then inline
--   content up to the closing @]@, which becomes the (trimmed) link text.
--
-- * @[uri]@: the closing bracket directly after the URI.  Previously
--   this form was rejected because the separating space was mandatory;
--   it now produces a link with the placeholder label.
--
-- When the label is empty (either @[uri]@ or @[uri ]@) the link text is
-- the placeholder @"1"@.  The whole parser backtracks on failure.
externalLink :: MWParser Inlines
externalLink = try $ do
  _ <- char '['
  (_, src) <- uri
  lab <- labelled <|> unlabelled
  return $ B.link src "" lab
  where
    labelled, unlabelled :: MWParser Inlines
    -- Once the separating space has been consumed this branch is
    -- committed; a failure afterwards fails the whole link (and the
    -- outer 'try' restores the input).
    labelled = do
      skipMany1 spaceChar
      ils <- manyTill inline (char ']')
      return $ if null ils
                  then autoLabel
                  else trimInlines (mconcat ils)
    -- Only reached when no space follows the URI.
    unlabelled = autoLabel <$ char ']'
    autoLabel :: Inlines
    autoLabel = B.str "1" -- TODO generate sequentially from state

-- | A bare URI becomes a link whose text is the first component
-- returned by 'uri' and whose target is the second.
url :: MWParser Inlines
url = fmap toLink uri
  where
    toLink (orig, src) = B.link src "" (B.str orig)

-- | Text inside @nowiki@ tags is not parsed as markup; it is converted
-- to inlines with 'B.text'.
nowiki :: MWParser Inlines
nowiki = do
  raw <- charsInTags "nowiki"
  return $ B.text raw

-- | Inlines wrapped in @strike@ or @del@ tags become struck-out text.
strikeout :: MWParser Inlines
strikeout = fmap B.strikeout $ inlinesInTags "strike" <|> inlinesInTags "del"

-- | Inlines wrapped in @sup@ tags become superscript.
superscript :: MWParser Inlines
superscript = fmap B.superscript (inlinesInTags "sup")

-- | Inlines wrapped in @sub@ tags become subscript.
subscript :: MWParser Inlines
subscript = fmap B.subscript (inlinesInTags "sub")

-- | Text wrapped in @code@ or @tt@ tags becomes inline code.
code :: MWParser Inlines
code = fmap B.code $ charsInTags "code" <|> charsInTags "tt"

-- | Text wrapped in @hask@ tags becomes inline code carrying the class
-- @haskell@.
hask :: MWParser Inlines
hask = fmap (B.codeWith attr) (charsInTags "hask")
  where
    attr = ("", ["haskell"], [])

-- | Parses a list of inlines between start and end delimiters.
--
-- Runs @start@, then collects inner inlines up to @end@ (via
-- 'many1Till'); the collected inlines are concatenated and trimmed.
-- The whole attempt is wrapped in 'try', so nothing is consumed when
-- it fails.
inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
inlinesBetween start end =
  (trimInlines . mconcat) <$> try (start >> many1Till inner end)
    -- inner: either whitespace accepted by innerSpace, or any inline
    -- that is not whitespace.
    where inner      = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
          -- innerSpace: whitespace is only accepted when it is not
          -- directly followed by the end delimiter.
          innerSpace = try $ whitespace >>~ notFollowedBy' end

-- | Emphasis delimited by @''@.  The opening delimiter must be followed
-- by a non-space character, and the closing delimiter is only accepted
-- where 'strong' would not parse.
emph :: MWParser Inlines
emph = fmap B.emph . nested $ inlinesBetween opener closer
  where
    opener = sym "''" >> lookAhead nonspaceChar
    closer = try $ notFollowedBy' (strong >> return ()) >> sym "''"

-- | Strong emphasis delimited by @'''@.  The opening delimiter must be
-- followed by a non-space character.
strong :: MWParser Inlines
strong = fmap B.strong . nested $ inlinesBetween opener closer
  where
    opener = sym "'''" >> lookAhead nonspaceChar
    closer = try (sym "'''")