aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <fiddlosopher@gmail.com>2012-09-10 10:02:12 -0700
committerJohn MacFarlane <fiddlosopher@gmail.com>2012-09-12 09:05:10 -0700
commit3d361b2101c097ecde343625b15da8c197d733eb (patch)
tree6c68d7003a64ed263223999e22d112644472aa11 /src/Text/Pandoc/Readers
parent167012daf75436208bcf275164792f3ec06ee56c (diff)
downloadpandoc-3d361b2101c097ecde343625b15da8c197d733eb.tar.gz
Added basic mediawiki reader.
Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs2
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs311
2 files changed, 311 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index d3d4e72ff..1c2cc12f1 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1,6 +1,4 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
-{-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
- GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
new file mode 100644
index 000000000..f3adbe72e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -0,0 +1,311 @@
+{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
+{-
+Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.MediaWiki
+ Copyright : Copyright (C) 2012 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of mediawiki text to 'Pandoc' document.
+-}
+{-
+TODO:
+_ fix pre parser -- it should use html tagsoup parsers,
+ then just strip out the text from text tags.
+_ correctly handle skipped level in list, e.g. # to ###
+_ tests for lists
+_ support HTML lists
+_ support list style attributes and start values in ol lists, also
+ value attribute on li
+_ support <p> tags in lists (and out?)
+_ support :, ::, etc. for indent (treat as list continuation paras?)
+_ support preformatted text (lines starting with space)
+_ support preformatted text blocks
+_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi <syntaxhighlight lang="php"> (alternativel, <source...>)
+ if 'line' attribute present, number lines
+ if 'start' present, set starting line number
+_ support internal links http://www.mediawiki.org/wiki/Help:Links
+_ support external links
+_ support automatic linkification of URLs
+_ support images http://www.mediawiki.org/wiki/Help:Images
+_ ignore gallery tag?
+_ support tables http://www.mediawiki.org/wiki/Help:Tables
+_ support <math> tag for latex math
+_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in
+ postprocessing
+_ category links
+_ tests for raw html inline
+_ tests for sup, sub, del
+_ tests for pre, haskell
+_ tests for code, tt, hask
+_ test for blockquote
+-}
+module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
+
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced,
+ isInlineTag, isBlockTag, isTextTag, isCommentTag )
+import Text.Pandoc.XML ( fromEntities )
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared ( stripTrailingNewlines )
+import Data.Monoid (mconcat, mempty)
+import Control.Applicative ((<$>), (<*), (*>), (<$))
+import Control.Monad
+import Data.List (intersperse)
+import Text.HTML.TagSoup
+
+-- | Read mediawiki from an input string and return a Pandoc document.
+readMediaWiki :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readMediaWiki opts s =
+ (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n")
+
+type MWParser = Parser [Char] ParserState
+
+--
+-- auxiliary functions
+--
+
+specialChars :: [Char]
+specialChars = "'[]<=&*"
+
+spaceChars :: [Char]
+spaceChars = " \n\t"
+
+sym :: String -> MWParser ()
+sym s = () <$ try (string s)
+
+htmlComment :: MWParser ()
+htmlComment = () <$ htmlTag isCommentTag
+
+inlinesInTags :: String -> MWParser Inlines
+inlinesInTags tag = trimInlines . mconcat <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill inline (htmlTag (~== TagClose tag)))
+
+blocksInTags :: String -> MWParser Blocks
+blocksInTags tag = mconcat <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill block (htmlTag (~== TagClose tag)))
+
+charsInTags :: String -> MWParser [Char]
+charsInTags tag = fromEntities <$> try
+ (htmlTag (~== TagOpen tag []) *>
+ manyTill anyChar (htmlTag (~== TagClose tag)))
+
+--
+-- main parser
+--
+
+parseMediaWiki :: MWParser Pandoc
+parseMediaWiki = do
+ bs <- mconcat <$> many block
+ spaces
+ eof
+ return $ B.doc bs
+
+--
+-- block parsers
+--
+
+block :: MWParser Blocks
+block = header
+ <|> hrule
+ <|> bulletList
+ <|> orderedList
+ <|> definitionList
+ <|> blockquote
+ <|> codeblock
+ <|> haskell
+ <|> mempty <$ skipMany1 blankline
+ <|> mempty <$ try (spaces *> htmlComment)
+ <|> para
+
+para :: MWParser Blocks
+para = B.para . trimInlines . mconcat <$> many1 inline
+
+hrule :: MWParser Blocks
+hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
+
+blockquote :: MWParser Blocks
+blockquote = B.blockQuote <$> blocksInTags "blockquote"
+
+codeblock :: MWParser Blocks
+codeblock = B.codeBlock . trimCode <$> charsInTags "pre"
+
+trimCode :: String -> String
+trimCode ('\n':xs) = stripTrailingNewlines xs
+trimCode xs = stripTrailingNewlines xs
+
+haskell :: MWParser Blocks
+haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
+ charsInTags "haskell"
+
+header :: MWParser Blocks
+header = try $ do
+ col <- sourceColumn <$> getPosition
+ guard $ col == 1 -- header must be at beginning of line
+ eqs <- many1 (char '=')
+ let lev = length eqs
+ guard $ lev <= 6
+ contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
+ return $ B.header lev contents
+
+bulletList :: MWParser Blocks
+bulletList = B.bulletList <$> many1 (listItem '*')
+
+orderedList :: MWParser Blocks
+orderedList = B.orderedList <$> many1 (listItem '#')
+
+definitionList :: MWParser Blocks
+definitionList = B.definitionList <$> many1 defListItem
+
+defListItem :: MWParser (Inlines, [Blocks])
+defListItem = try $ do
+ terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm
+ defs <- many1 $ listItem ':'
+ return (terms, defs)
+
+defListTerm :: MWParser Inlines
+defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
+ parseFromString (trimInlines . mconcat <$> many inline)
+
+listStart :: Char -> MWParser ()
+listStart c = char c *> notFollowedBy listStartChar
+
+listStartChar :: MWParser Char
+listStartChar = oneOf "*#;:"
+
+anyListStart :: MWParser ()
+anyListStart = listStart '*' <|> listStart '#' <|> listStart ';'
+
+listItem :: Char -> MWParser Blocks
+listItem c = try $ do
+ listStart c
+ first <- manyTill anyChar newline
+ rest <- many (try $ char c *> lookAhead listStartChar *>
+ manyTill anyChar newline)
+ parseFromString (mconcat <$> many1 block) $ unlines $ first : rest
+
+--
+-- inline parsers
+--
+
+inline :: MWParser Inlines
+inline = whitespace
+ <|> url
+ <|> str
+ <|> strong
+ <|> emph
+ <|> nowiki
+ <|> linebreak
+ <|> externalLink
+ <|> strikeout
+ <|> subscript
+ <|> superscript
+ <|> code
+ <|> hask
+ <|> B.singleton <$> charRef
+ <|> inlineHtml
+ <|> special
+
+str :: MWParser Inlines
+str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
+
+special :: MWParser Inlines
+special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *>
+ oneOf specialChars)
+
+inlineHtml :: MWParser Inlines
+inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag
+
+whitespace :: MWParser Inlines
+whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
+
+endline :: MWParser ()
+endline = () <$ try (newline <*
+ notFollowedBy blankline <*
+ notFollowedBy' hrule <*
+ notFollowedBy anyListStart)
+
+linebreak :: MWParser Inlines
+linebreak = B.linebreak <$
+ (htmlTag (~== TagOpen "br" []) *>
+ optional (htmlTag (~== TagClose "br")) *>
+ optional blankline)
+
+externalLink :: MWParser Inlines
+externalLink = try $ do
+ char '['
+ (orig, src) <- uri
+ skipMany1 spaceChar
+ lab <- manyTill inline (char ']')
+ let lab' = if null lab
+ then [B.str "1"] -- TODO generate sequentially from state
+ else lab
+ return $ B.link src "" $ trimInlines $ mconcat lab'
+
+url :: MWParser Inlines
+url = do
+ (_, src) <- uri
+ return $ B.link src "" (B.str orig)
+
+nowiki :: MWParser Inlines
+nowiki = B.text <$> charsInTags "nowiki"
+
+strikeout :: MWParser Inlines
+strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del")
+
+superscript :: MWParser Inlines
+superscript = B.superscript <$> inlinesInTags "sup"
+
+subscript :: MWParser Inlines
+subscript = B.subscript <$> inlinesInTags "sub"
+
+code :: MWParser Inlines
+code = B.code <$> (charsInTags "code" <|> charsInTags "tt")
+
+hask :: MWParser Inlines
+hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
+
+-- | Parses a list of inlines between start and end delimiters.
+inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
+inlinesBetween start end =
+ (trimInlines . mconcat) <$> try (start >> many1Till inner end)
+ where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
+ innerSpace = try $ whitespace >>~ notFollowedBy' end
+
+emph :: MWParser Inlines
+emph = B.emph <$> nested (inlinesBetween start end)
+ where start = sym "''" >> lookAhead nonspaceChar
+ end = try $ notFollowedBy' (() <$ strong) >> sym "''"
+
+strong :: MWParser Inlines
+strong = B.strong <$> nested (inlinesBetween start end)
+ where start = sym "'''" >> lookAhead nonspaceChar
+ end = try $ sym "'''"
+