From 3d361b2101c097ecde343625b15da8c197d733eb Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 10 Sep 2012 10:02:12 -0700 Subject: Added basic mediawiki reader. Text.Pandoc.Readers.MediaWiki module, tests/mediawiki-reader.{txt,native}. --- src/Text/Pandoc/Readers/Markdown.hs | 2 - src/Text/Pandoc/Readers/MediaWiki.hs | 311 +++++++++++++++++++++++++++++++++++ 2 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 src/Text/Pandoc/Readers/MediaWiki.hs (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index d3d4e72ff..1c2cc12f1 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1,6 +1,4 @@ {-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 -{-# LANGUAGE FlexibleInstances, TypeSynonymInstances, - GeneralizedNewtypeDeriving #-} {- Copyright (C) 2006-2010 John MacFarlane diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs new file mode 100644 index 000000000..f3adbe72e --- /dev/null +++ b/src/Text/Pandoc/Readers/MediaWiki.hs @@ -0,0 +1,311 @@ +{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 +{- +Copyright (C) 2012 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.MediaWiki + Copyright : Copyright (C) 2012 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of mediawiki text to 'Pandoc' document. +-} +{- +TODO: +_ fix pre parser -- it should use html tagsoup parsers, + then just strip out the text from text tags. +_ correctly handle skipped level in list, e.g. # to ### +_ tests for lists +_ support HTML lists +_ support list style attributes and start values in ol lists, also + value attribute on li +_ support

tags in lists (and out?) +_ support :, ::, etc. for indent (treat as list continuation paras?) +_ support preformatted text (lines starting with space) +_ support preformatted text blocks +_ code highlighting: http://www.mediawiki.org/wiki/Extension:SyntaxHighlight_GeSHi (alternativel, ) + if 'line' attribute present, number lines + if 'start' present, set starting line number +_ support internal links http://www.mediawiki.org/wiki/Help:Links +_ support external links +_ support automatic linkification of URLs +_ support images http://www.mediawiki.org/wiki/Help:Images +_ ignore gallery tag? +_ support tables http://www.mediawiki.org/wiki/Help:Tables +_ support tag for latex math +_ templates or anything in {{}} -> handle as raw wikimedia, can be dealt with in + postprocessing +_ category links +_ tests for raw html inline +_ tests for sup, sub, del +_ tests for pre, haskell +_ tests for code, tt, hask +_ test for blockquote +-} +module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where + +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>)) +import Text.Pandoc.Options +import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, + isInlineTag, isBlockTag, isTextTag, isCommentTag ) +import Text.Pandoc.XML ( fromEntities ) +import Text.Pandoc.Parsing +import Text.Pandoc.Shared ( stripTrailingNewlines ) +import Data.Monoid (mconcat, mempty) +import Control.Applicative ((<$>), (<*), (*>), (<$)) +import Control.Monad +import Data.List (intersperse) +import Text.HTML.TagSoup + +-- | Read mediawiki from an input string and return a Pandoc document. +readMediaWiki :: ReaderOptions -- ^ Reader options + -> String -- ^ String to parse (assuming @'\n'@ line endings) + -> Pandoc +readMediaWiki opts s = + (readWith parseMediaWiki) def{ stateOptions = opts } (s ++ "\n\n") + +type MWParser = Parser [Char] ParserState + +-- +-- auxiliary functions +-- + +specialChars :: [Char] +specialChars = "'[]<=&*" + +spaceChars :: [Char] +spaceChars = " \n\t" + +sym :: String -> MWParser () +sym s = () <$ try (string s) + +htmlComment :: MWParser () +htmlComment = () <$ htmlTag isCommentTag + +inlinesInTags :: String -> MWParser Inlines +inlinesInTags tag = trimInlines . mconcat <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill inline (htmlTag (~== TagClose tag))) + +blocksInTags :: String -> MWParser Blocks +blocksInTags tag = mconcat <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill block (htmlTag (~== TagClose tag))) + +charsInTags :: String -> MWParser [Char] +charsInTags tag = fromEntities <$> try + (htmlTag (~== TagOpen tag []) *> + manyTill anyChar (htmlTag (~== TagClose tag))) + +-- +-- main parser +-- + +parseMediaWiki :: MWParser Pandoc +parseMediaWiki = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + +-- +-- block parsers +-- + +block :: MWParser Blocks +block = header + <|> hrule + <|> bulletList + <|> orderedList + <|> definitionList + <|> blockquote + <|> codeblock + <|> haskell + <|> mempty <$ skipMany1 blankline + <|> mempty <$ try (spaces *> htmlComment) + <|> para + +para :: MWParser Blocks +para = B.para . trimInlines . mconcat <$> many1 inline + +hrule :: MWParser Blocks +hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline) + +blockquote :: MWParser Blocks +blockquote = B.blockQuote <$> blocksInTags "blockquote" + +codeblock :: MWParser Blocks +codeblock = B.codeBlock . trimCode <$> charsInTags "pre" + +trimCode :: String -> String +trimCode ('\n':xs) = stripTrailingNewlines xs +trimCode xs = stripTrailingNewlines xs + +haskell :: MWParser Blocks +haskell = B.codeBlockWith ("",["haskell"],[]) . trimCode <$> + charsInTags "haskell" + +header :: MWParser Blocks +header = try $ do + col <- sourceColumn <$> getPosition + guard $ col == 1 -- header must be at beginning of line + eqs <- many1 (char '=') + let lev = length eqs + guard $ lev <= 6 + contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=') + return $ B.header lev contents + +bulletList :: MWParser Blocks +bulletList = B.bulletList <$> many1 (listItem '*') + +orderedList :: MWParser Blocks +orderedList = B.orderedList <$> many1 (listItem '#') + +definitionList :: MWParser Blocks +definitionList = B.definitionList <$> many1 defListItem + +defListItem :: MWParser (Inlines, [Blocks]) +defListItem = try $ do + terms <- mconcat . intersperse B.linebreak <$> many1 defListTerm + defs <- many1 $ listItem ':' + return (terms, defs) + +defListTerm :: MWParser Inlines +defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>= + parseFromString (trimInlines . mconcat <$> many inline) + +listStart :: Char -> MWParser () +listStart c = char c *> notFollowedBy listStartChar + +listStartChar :: MWParser Char +listStartChar = oneOf "*#;:" + +anyListStart :: MWParser () +anyListStart = listStart '*' <|> listStart '#' <|> listStart ';' + +listItem :: Char -> MWParser Blocks +listItem c = try $ do + listStart c + first <- manyTill anyChar newline + rest <- many (try $ char c *> lookAhead listStartChar *> + manyTill anyChar newline) + parseFromString (mconcat <$> many1 block) $ unlines $ first : rest + +-- +-- inline parsers +-- + +inline :: MWParser Inlines +inline = whitespace + <|> url + <|> str + <|> strong + <|> emph + <|> nowiki + <|> linebreak + <|> externalLink + <|> strikeout + <|> subscript + <|> superscript + <|> code + <|> hask + <|> B.singleton <$> charRef + <|> inlineHtml + <|> special + +str :: MWParser Inlines +str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars) + +special :: MWParser Inlines +special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag) *> + oneOf specialChars) + +inlineHtml :: MWParser Inlines +inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag + +whitespace :: MWParser Inlines +whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment) + +endline :: MWParser () +endline = () <$ try (newline <* + notFollowedBy blankline <* + notFollowedBy' hrule <* + notFollowedBy anyListStart) + +linebreak :: MWParser Inlines +linebreak = B.linebreak <$ + (htmlTag (~== TagOpen "br" []) *> + optional (htmlTag (~== TagClose "br")) *> + optional blankline) + +externalLink :: MWParser Inlines +externalLink = try $ do + char '[' + (orig, src) <- uri + skipMany1 spaceChar + lab <- manyTill inline (char ']') + let lab' = if null lab + then [B.str "1"] -- TODO generate sequentially from state + else lab + return $ B.link src "" $ trimInlines $ mconcat lab' + +url :: MWParser Inlines +url = do + (_, src) <- uri + return $ B.link src "" (B.str orig) + +nowiki :: MWParser Inlines +nowiki = B.text <$> charsInTags "nowiki" + +strikeout :: MWParser Inlines +strikeout = B.strikeout <$> (inlinesInTags "strike" <|> inlinesInTags "del") + +superscript :: MWParser Inlines +superscript = B.superscript <$> inlinesInTags "sup" + +subscript :: MWParser Inlines +subscript = B.subscript <$> inlinesInTags "sub" + +code :: MWParser Inlines +code = B.code <$> (charsInTags "code" <|> charsInTags "tt") + +hask :: MWParser Inlines +hask = B.codeWith ("",["haskell"],[]) <$> charsInTags "hask" + +-- | Parses a list of inlines between start and end delimiters. +inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines +inlinesBetween start end = + (trimInlines . mconcat) <$> try (start >> many1Till inner end) + where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline) + innerSpace = try $ whitespace >>~ notFollowedBy' end + +emph :: MWParser Inlines +emph = B.emph <$> nested (inlinesBetween start end) + where start = sym "''" >> lookAhead nonspaceChar + end = try $ notFollowedBy' (() <$ strong) >> sym "''" + +strong :: MWParser Inlines +strong = B.strong <$> nested (inlinesBetween start end) + where start = sym "'''" >> lookAhead nonspaceChar + end = try $ sym "'''" + -- cgit v1.2.3