aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs16
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs36
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs14
-rw-r--r--src/Text/Pandoc/Readers/MediaWiki.hs594
-rw-r--r--src/Text/Pandoc/Readers/RST.hs2
-rw-r--r--src/Text/Pandoc/Readers/Textile.hs16
6 files changed, 637 insertions, 41 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index e5c310ffc..424d9bdec 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -271,6 +271,7 @@ pCodeBlock = try $ do
inline :: TagParser [Inline]
inline = choice
[ pTagText
+ , pQ
, pEmph
, pStrong
, pSuperscript
@@ -306,6 +307,17 @@ pSelfClosing f g = do
optional $ pSatisfy (tagClose f)
return open
+pQ :: TagParser [Inline]
+pQ = do
+ quoteContext <- stateQuoteContext `fmap` getState
+ let quoteType = case quoteContext of
+ InDoubleQuote -> SingleQuote
+ _ -> DoubleQuote
+ let innerQuoteContext = if quoteType == SingleQuote
+ then InSingleQuote
+ else InDoubleQuote
+ withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType)
+
pEmph :: TagParser [Inline]
pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
@@ -585,9 +597,9 @@ htmlInBalanced f = try $ do
return $ tag ++ concat contents ++ endtag
-- | Matches a tag meeting a certain condition.
-htmlTag :: (Tag String -> Bool) -> Parser [Char] ParserState (Tag String, String)
+htmlTag :: (Tag String -> Bool) -> Parser [Char] st (Tag String, String)
htmlTag f = try $ do
- lookAhead (char '<')
+ lookAhead $ char '<' >> (oneOf "/!?" <|> letter)
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
guard $ f next
-- advance the parser
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 4a5a14d6a..86ae400de 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -166,10 +166,8 @@ double_quote = (doubleQuoted . mconcat) <$>
(try $ string "``" *> manyTill inline (try $ string "''"))
single_quote :: LP Inlines
-single_quote = char '`' *>
- ( try ((singleQuoted . mconcat) <$>
- manyTill inline (try $ char '\'' >> notFollowedBy letter))
- <|> lit "`")
+single_quote = (singleQuoted . mconcat) <$>
+ (try $ char '`' *> manyTill inline (try $ char '\'' >> notFollowedBy letter))
inline :: LP Inlines
inline = (mempty <$ comment)
@@ -181,6 +179,9 @@ inline = (mempty <$ comment)
((char '-') *> option (str "–") (str "—" <$ char '-')))
<|> double_quote
<|> single_quote
+ <|> (str "“" <$ try (string "``")) -- nb. {``} won't be caught by double_quote
+ <|> (str "”" <$ try (string "''"))
+ <|> (str "‘" <$ char '`') -- nb. {`} won't be caught by single_quote
<|> (str "’" <$ char '\'')
<|> (str "\160" <$ char '~')
<|> (mathDisplay $ string "$$" *> mathChars <* string "$$")
@@ -188,10 +189,9 @@ inline = (mempty <$ comment)
<|> (superscript <$> (char '^' *> tok))
<|> (subscript <$> (char '_' *> tok))
<|> (guardEnabled Ext_literate_haskell *> char '|' *> doLHSverb)
- <|> (str <$> count 1 tildeEscape)
- <|> (str <$> string "]")
- <|> (str <$> string "#") -- TODO print warning?
- <|> (str <$> string "&") -- TODO print warning?
+ <|> (str . (:[]) <$> tildeEscape)
+ <|> (str . (:[]) <$> oneOf "[]")
+ <|> (str . (:[]) <$> oneOf "#&") -- TODO print warning?
-- <|> (str <$> count 1 (satisfy (\c -> c /= '\\' && c /='\n' && c /='}' && c /='{'))) -- eat random leftover characters
inlines :: LP Inlines
@@ -203,8 +203,8 @@ block = (mempty <$ comment)
<|> environment
<|> mempty <$ macro -- TODO improve macros, make them work everywhere
<|> blockCommand
- <|> grouped block
<|> paragraph
+ <|> grouped block
<|> (mempty <$ char '&') -- loose & in table environment
@@ -214,6 +214,7 @@ blocks = mconcat <$> many block
blockCommand :: LP Blocks
blockCommand = try $ do
name <- anyControlSeq
+ guard $ name /= "begin" && name /= "end"
star <- option "" (string "*" <* optional sp)
let name' = name ++ star
case M.lookup name' blockCommands of
@@ -265,8 +266,6 @@ blockCommands = M.fromList $
, ("closing", skipopts *> closing)
--
, ("rule", skipopts *> tok *> tok *> pure horizontalRule)
- , ("begin", mzero) -- these are here so they won't be interpreted as inline
- , ("end", mzero)
, ("item", skipopts *> loose_item)
, ("documentclass", skipopts *> braced *> preamble)
, ("centerline", (para . trimInlines) <$> (skipopts *> tok))
@@ -321,6 +320,7 @@ section lvl = do
inlineCommand :: LP Inlines
inlineCommand = try $ do
name <- anyControlSeq
+ guard $ name /= "begin" && name /= "end"
guard $ not $ isBlockCommand name
parseRaw <- getOption readerParseRaw
star <- option "" (string "*")
@@ -352,6 +352,7 @@ inlineCommands = M.fromList $
, ("textsubscript", subscript <$> tok)
, ("textbackslash", lit "\\")
, ("backslash", lit "\\")
+ , ("slash", lit "/")
, ("textbf", strong <$> tok)
, ("ldots", lit "…")
, ("dots", lit "…")
@@ -644,11 +645,7 @@ inlineText :: LP Inlines
inlineText = str <$> many1 inlineChar
inlineChar :: LP Char
-inlineChar = satisfy $ \c ->
- not (c == '\\' || c == '$' || c == '%' || c == '^' || c == '_' ||
- c == '&' || c == '~' || c == '#' || c == '{' || c == '}' ||
- c == '^' || c == '\'' || c == '`' || c == '-' || c == ']' ||
- c == ' ' || c == '\t' || c == '\n' )
+inlineChar = noneOf "\\$%^_&~#{}^'`-[] \t\n"
environment :: LP Blocks
environment = do
@@ -745,6 +742,9 @@ environments = M.fromList
, ("lstlisting", codeBlock <$> (verbEnv "lstlisting"))
, ("minted", liftA2 (\l c -> codeBlockWith ("",[l],[]) c)
(grouped (many1 $ satisfy (/= '}'))) (verbEnv "minted"))
+ , ("obeylines", parseFromString
+ (para . trimInlines . mconcat <$> many inline) =<<
+ intercalate "\\\\\n" . lines <$> verbEnv "obeylines")
, ("displaymath", mathEnv Nothing "displaymath")
, ("equation", mathEnv Nothing "equation")
, ("equation*", mathEnv Nothing "equation*")
@@ -801,7 +801,9 @@ descItem = do
return (ils, [bs])
env :: String -> LP a -> LP a
-env name p = p <* (controlSeq "end" *> braced >>= guard . (== name))
+env name p = p <*
+ (try (controlSeq "end" *> braced >>= guard . (== name))
+ <?> ("\\end{" ++ name ++ "}"))
listenv :: String -> LP a -> LP a
listenv name p = try $ do
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 2407e137c..1c2cc12f1 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1,6 +1,4 @@
{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
-{-# LANGUAGE FlexibleInstances, TypeSynonymInstances,
- GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
@@ -1294,18 +1292,6 @@ inlinesBetween start end =
where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
innerSpace = try $ whitespace >>~ notFollowedBy' end
--- This is used to prevent exponential blowups for things like:
--- a**a*a**a*a**a*a**a*a**a*a**a*a**
-nested :: Parser [Char] ParserState a
- -> Parser [Char] ParserState a
-nested p = do
- nestlevel <- stateMaxNestingLevel `fmap` getState
- guard $ nestlevel > 0
- updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 }
- res <- p
- updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
- return res
-
emph :: Parser [Char] ParserState (F Inlines)
emph = fmap B.emph <$> nested
(inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd)
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
new file mode 100644
index 000000000..7936be38b
--- /dev/null
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -0,0 +1,594 @@
+{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7
+{-
+ Copyright (C) 2012 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.MediaWiki
+ Copyright : Copyright (C) 2012 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of mediawiki text to 'Pandoc' document.
+-}
+{-
+TODO:
+_ correctly handle tables within tables
+_ parse templates?
+-}
+module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
+
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
+import Text.Pandoc.Options
+import Text.Pandoc.Readers.HTML ( htmlTag, isBlockTag, isCommentTag )
+import Text.Pandoc.XML ( fromEntities )
+import Text.Pandoc.Parsing hiding ( nested )
+import Text.Pandoc.Generic ( bottomUp )
+import Text.Pandoc.Shared ( stripTrailingNewlines, safeRead )
+import Data.Monoid (mconcat, mempty)
+import Control.Applicative ((<$>), (<*), (*>), (<$))
+import Control.Monad
+import Data.List (intersperse, intercalate, isPrefixOf )
+import Text.HTML.TagSoup
+import Data.Sequence (viewl, ViewL(..), (<|))
+import Data.Char (isDigit)
+
+-- | Read mediawiki from an input string and return a Pandoc document.
+readMediaWiki :: ReaderOptions -- ^ Reader options
+ -> String -- ^ String to parse (assuming @'\n'@ line endings)
+ -> Pandoc
+readMediaWiki opts s =
+ case runParser parseMediaWiki MWState{ mwOptions = opts
+ , mwMaxNestingLevel = 4
+ , mwNextLinkNumber = 1
+ , mwCategoryLinks = []
+ }
+ "source" (s ++ "\n") of
+ Left err' -> error $ "\nError:\n" ++ show err'
+ Right result -> result
+
+data MWState = MWState { mwOptions :: ReaderOptions
+ , mwMaxNestingLevel :: Int
+ , mwNextLinkNumber :: Int
+ , mwCategoryLinks :: [Inlines]
+ }
+
+type MWParser = Parser [Char] MWState
+
+--
+-- auxiliary functions
+--
+
+-- This is used to prevent exponential blowups for things like:
+-- ''a'''a''a'''a''a'''a''a'''a
+nested :: MWParser a -> MWParser a
+nested p = do
+ nestlevel <- mwMaxNestingLevel `fmap` getState
+ guard $ nestlevel > 0
+ updateState $ \st -> st{ mwMaxNestingLevel = mwMaxNestingLevel st - 1 }
+ res <- p
+ updateState $ \st -> st{ mwMaxNestingLevel = nestlevel }
+ return res
+
+specialChars :: [Char]
+specialChars = "'[]<=&*{}|\""
+
+spaceChars :: [Char]
+spaceChars = " \n\t"
+
+sym :: String -> MWParser ()
+sym s = () <$ try (string s)
+
+newBlockTags :: [String]
+newBlockTags = ["haskell","syntaxhighlight","source","gallery","references"]
+
+isBlockTag' :: Tag String -> Bool
+isBlockTag' tag@(TagOpen t _) = (isBlockTag tag || t `elem` newBlockTags) &&
+ t `notElem` eitherBlockOrInline
+isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) &&
+ t `notElem` eitherBlockOrInline
+isBlockTag' tag = isBlockTag tag
+
+isInlineTag' :: Tag String -> Bool
+isInlineTag' (TagComment _) = True
+isInlineTag' t = not (isBlockTag' t)
+
+eitherBlockOrInline :: [String]
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+ "map", "area", "object"]
+
+htmlComment :: MWParser ()
+htmlComment = () <$ htmlTag isCommentTag
+
+inlinesInTags :: String -> MWParser Inlines
+inlinesInTags tag = try $ do
+ (_,raw) <- htmlTag (~== TagOpen tag [])
+ if '/' `elem` raw -- self-closing tag
+ then return mempty
+ else trimInlines . mconcat <$>
+ manyTill inline (htmlTag (~== TagClose tag))
+
+blocksInTags :: String -> MWParser Blocks
+blocksInTags tag = try $ do
+ (_,raw) <- htmlTag (~== TagOpen tag [])
+ if '/' `elem` raw -- self-closing tag
+ then return mempty
+ else mconcat <$> manyTill block (htmlTag (~== TagClose tag))
+
+charsInTags :: String -> MWParser [Char]
+charsInTags tag = try $ do
+ (_,raw) <- htmlTag (~== TagOpen tag [])
+ if '/' `elem` raw -- self-closing tag
+ then return ""
+ else innerText . parseTags <$>
+ manyTill anyChar (htmlTag (~== TagClose tag))
+
+--
+-- main parser
+--
+
+parseMediaWiki :: MWParser Pandoc
+parseMediaWiki = do
+ bs <- mconcat <$> many block
+ spaces
+ eof
+ categoryLinks <- reverse . mwCategoryLinks <$> getState
+ let categories = if null categoryLinks
+ then mempty
+ else B.para $ mconcat $ intersperse B.space categoryLinks
+ return $ B.doc $ bs <> categories
+
+--
+-- block parsers
+--
+
+block :: MWParser Blocks
+block = mempty <$ skipMany1 blankline
+ <|> table
+ <|> header
+ <|> hrule
+ <|> orderedList
+ <|> bulletList
+ <|> definitionList
+ <|> mempty <$ try (spaces *> htmlComment)
+ <|> preformatted
+ <|> blockTag
+ <|> (B.rawBlock "mediawiki" <$> template)
+ <|> para
+
+para :: MWParser Blocks
+para = B.para . trimInlines . mconcat <$> many1 inline
+
+table :: MWParser Blocks
+table = do
+ tableStart
+ styles <- manyTill anyChar newline
+ let tableWidth = case lookup "width" $ parseAttrs styles of
+ Just w -> maybe 1.0 id $ parseWidth w
+ Nothing -> 1.0
+ caption <- option mempty tableCaption
+ optional rowsep
+ hasheader <- option False $ True <$ (lookAhead (char '!'))
+ (cellspecs',hdr) <- unzip <$> tableRow
+ let widths = map ((tableWidth *) . snd) cellspecs'
+ let restwidth = tableWidth - sum widths
+ let zerocols = length $ filter (==0.0) widths
+ let defaultwidth = if zerocols == 0 || zerocols == length widths
+ then 0.0
+ else restwidth / fromIntegral zerocols
+ let widths' = map (\w -> if w == 0 then defaultwidth else w) widths
+ let cellspecs = zip (map fst cellspecs') widths'
+ rows' <- many $ try $ rowsep *> (map snd <$> tableRow)
+ tableEnd
+ let cols = length hdr
+ let (headers,rows) = if hasheader
+ then (hdr, rows')
+ else (replicate cols mempty, hdr:rows')
+ return $ B.table caption cellspecs headers rows
+
+parseAttrs :: String -> [(String,String)]
+parseAttrs s = case parse (many parseAttr) "attributes" s of
+ Right r -> r
+ Left _ -> []
+
+parseAttr :: Parser String () (String, String)
+parseAttr = try $ do
+ skipMany spaceChar
+ k <- many1 letter
+ char '='
+ char '"'
+ v <- many1Till anyChar (char '"')
+ return (k,v)
+
+tableStart :: MWParser ()
+tableStart = try $ guardColumnOne *> sym "{|"
+
+tableEnd :: MWParser ()
+tableEnd = try $ guardColumnOne *> sym "|}" <* blanklines
+
+rowsep :: MWParser ()
+rowsep = try $ guardColumnOne *> sym "|-" <* blanklines
+
+cellsep :: MWParser ()
+cellsep = try $
+ (guardColumnOne <*
+ ( (char '|' <* notFollowedBy (oneOf "-}+"))
+ <|> (char '!')
+ )
+ )
+ <|> (() <$ try (string "||"))
+ <|> (() <$ try (string "!!"))
+
+tableCaption :: MWParser Inlines
+tableCaption = try $ do
+ guardColumnOne
+ sym "|+"
+ skipMany spaceChar
+ res <- manyTill anyChar newline >>= parseFromString (many inline)
+ return $ trimInlines $ mconcat res
+
+tableRow :: MWParser [((Alignment, Double), Blocks)]
+tableRow = try $ many tableCell
+
+tableCell :: MWParser ((Alignment, Double), Blocks)
+tableCell = try $ do
+ cellsep
+ skipMany spaceChar
+ attrs <- option [] $ try $ parseAttrs <$>
+ manyTill (satisfy (/='\n')) (char '|' <* notFollowedBy (char '|'))
+ skipMany spaceChar
+ ls <- concat <$> many (notFollowedBy (cellsep <|> rowsep <|> tableEnd) *>
+ ((snd <$> withRaw table) <|> count 1 anyChar))
+ bs <- parseFromString (mconcat <$> many block) ls
+ let align = case lookup "align" attrs of
+ Just "left" -> AlignLeft
+ Just "right" -> AlignRight
+ Just "center" -> AlignCenter
+ _ -> AlignDefault
+ let width = case lookup "width" attrs of
+ Just xs -> maybe 0.0 id $ parseWidth xs
+ Nothing -> 0.0
+ return ((align, width), bs)
+
+parseWidth :: String -> Maybe Double
+parseWidth s =
+ case reverse s of
+ ('%':ds) | all isDigit ds -> safeRead ('0':'.':reverse ds)
+ _ -> Nothing
+
+template :: MWParser String
+template = try $ do
+ string "{{"
+ notFollowedBy (char '{')
+ let chunk = template <|> variable <|> many1 (noneOf "{}") <|> count 1 anyChar
+ contents <- manyTill chunk (try $ string "}}")
+ return $ "{{" ++ concat contents ++ "}}"
+
+blockTag :: MWParser Blocks
+blockTag = do
+ (tag, _) <- lookAhead $ htmlTag isBlockTag'
+ case tag of
+ TagOpen "blockquote" _ -> B.blockQuote <$> blocksInTags "blockquote"
+ TagOpen "pre" _ -> B.codeBlock . trimCode <$> charsInTags "pre"
+ TagOpen "syntaxhighlight" attrs -> syntaxhighlight "syntaxhighlight" attrs
+ TagOpen "source" attrs -> syntaxhighlight "source" attrs
+ TagOpen "haskell" _ -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
+ charsInTags "haskell"
+ TagOpen "gallery" _ -> blocksInTags "gallery"
+ TagOpen "p" _ -> mempty <$ htmlTag (~== tag)
+ TagClose "p" -> mempty <$ htmlTag (~== tag)
+ _ -> B.rawBlock "html" . snd <$> htmlTag (~== tag)
+
+trimCode :: String -> String
+trimCode ('\n':xs) = stripTrailingNewlines xs
+trimCode xs = stripTrailingNewlines xs
+
+syntaxhighlight :: String -> [Attribute String] -> MWParser Blocks
+syntaxhighlight tag attrs = try $ do
+ let mblang = lookup "lang" attrs
+ let mbstart = lookup "start" attrs
+ let mbline = lookup "line" attrs
+ let classes = maybe [] (:[]) mblang ++ maybe [] (const ["numberLines"]) mbline
+ let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart
+ contents <- charsInTags tag
+ return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents
+
+hrule :: MWParser Blocks
+hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
+
+guardColumnOne :: MWParser ()
+guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1)
+
+preformatted :: MWParser Blocks
+preformatted = try $ do
+ guardColumnOne
+ char ' '
+ let endline' = B.linebreak <$ (try $ newline <* char ' ')
+ let whitespace' = B.str <$> many1 ('\160' <$ spaceChar)
+ let spToNbsp ' ' = '\160'
+ spToNbsp x = x
+ let nowiki' = mconcat . intersperse B.linebreak . map B.str .
+ lines . fromEntities . map spToNbsp <$> try
+ (htmlTag (~== TagOpen "nowiki" []) *>
+ manyTill anyChar (htmlTag (~== TagClose "nowiki")))
+ let inline' = whitespace' <|> endline' <|> nowiki' <|> inline
+ let strToCode (Str s) = Code ("",[],[]) s
+ strToCode x = x
+ B.para . bottomUp strToCode . mconcat <$> many1 inline'
+
+header :: MWParser Blocks
+header = try $ do
+ guardColumnOne
+ eqs <- many1 (char '=')
+ let lev = length eqs
+ guard $ lev <= 6
+ contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
+ return $ B.header lev contents
+
+bulletList :: MWParser Blocks
+bulletList = B.bulletList <$>
+ ( many1 (listItem '*')
+ <|> (htmlTag (~== TagOpen "ul" []) *> spaces *> many (listItem '*' <|> li) <*
+ optional (htmlTag (~== TagClose "ul"))) )
+
+orderedList :: MWParser Blocks
+orderedList =
+ (B.orderedList <$> many1 (listItem '#'))
+ <|> (B.orderedList <$> (htmlTag (~== TagOpen "ul" []) *> spaces *>
+ many (listItem '#' <|> li) <*
+ optional (htmlTag (~== TagClose "ul"))))
+ <|> do (tag,_) <- htmlTag (~== TagOpen "ol" [])
+ spaces
+ items <- many (listItem '#' <|> li)
+ optional (htmlTag (~== TagClose "ol"))
+ let start = maybe 1 id $ safeRead $ fromAttrib "start" tag
+ return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items
+
+definitionList :: MWParser Blocks
+definitionList = B.definitionList <$> many1 defListItem
+
+defListItem :: MWParser (Inlines, [Blocks])
+defListItem = try $ do
+ terms <- mconcat . intersperse B.linebreak <$> many defListTerm
+ -- we allow dd with no dt, or dt with no dd
+ defs <- if B.isNull terms
+ then many1 $ listItem ':'
+ else many $ listItem ':'
+ return (terms, defs)
+
+defListTerm :: MWParser Inlines
+defListTerm = char ';' >> skipMany spaceChar >> manyTill anyChar newline >>=
+ parseFromString (trimInlines . mconcat <$> many inline)
+
+listStart :: Char -> MWParser ()
+listStart c = char c *> notFollowedBy listStartChar
+
+listStartChar :: MWParser Char
+listStartChar = oneOf "*#;:"
+
+anyListStart :: MWParser Char
+anyListStart = char '*'
+ <|> char '#'
+ <|> char ':'
+ <|> char ';'
+
+li :: MWParser Blocks
+li = lookAhead (htmlTag (~== TagOpen "li" [])) *>
+ (firstParaToPlain <$> blocksInTags "li") <* spaces
+
+listItem :: Char -> MWParser Blocks
+listItem c = try $ do
+ extras <- many (try $ char c <* lookAhead listStartChar)
+ if null extras
+ then listItem' c
+ else do
+ skipMany spaceChar
+ first <- concat <$> manyTill listChunk newline
+ rest <- many
+ (try $ string extras *> (concat <$> manyTill listChunk newline))
+ contents <- parseFromString (many1 $ listItem' c)
+ (unlines (first : rest))
+ case c of
+ '*' -> return $ B.bulletList contents
+ '#' -> return $ B.orderedList contents
+ ':' -> return $ B.definitionList [(mempty, contents)]
+ _ -> mzero
+
+-- The point of this is to handle stuff like
+-- * {{cite book
+-- | blah
+-- | blah
+-- }}
+-- * next list item
+-- which seems to be valid mediawiki.
+listChunk :: MWParser String
+listChunk = template <|> count 1 anyChar
+
+listItem' :: Char -> MWParser Blocks
+listItem' c = try $ do
+ listStart c
+ skipMany spaceChar
+ first <- concat <$> manyTill listChunk newline
+ rest <- many (try $ char c *> lookAhead listStartChar *>
+ (concat <$> manyTill listChunk newline))
+ parseFromString (firstParaToPlain . mconcat <$> many1 block)
+ $ unlines $ first : rest
+
+firstParaToPlain :: Blocks -> Blocks
+firstParaToPlain contents =
+ case viewl (B.unMany contents) of
+ (Para xs) :< ys -> B.Many $ (Plain xs) <| ys
+ _ -> contents
+
+--
+-- inline parsers
+--
+
+inline :: MWParser Inlines
+inline = whitespace
+ <|> url
+ <|> str
+ <|> doubleQuotes
+ <|> strong
+ <|> emph
+ <|> image
+ <|> internalLink
+ <|> externalLink
+ <|> inlineTag
+ <|> B.singleton <$> charRef
+ <|> inlineHtml
+ <|> (B.rawInline "mediawiki" <$> variable)
+ <|> (B.rawInline "mediawiki" <$> template)
+ <|> special
+
+str :: MWParser Inlines
+str = B.str <$> many1 (noneOf $ specialChars ++ spaceChars)
+
+variable :: MWParser String
+variable = try $ do
+ string "{{{"
+ contents <- manyTill anyChar (try $ string "}}}")
+ return $ "{{{" ++ contents ++ "}}}"
+
+inlineTag :: MWParser Inlines
+inlineTag = do
+ (tag, _) <- lookAhead $ htmlTag isInlineTag'
+ case tag of
+ TagOpen "ref" _ -> B.note . B.plain <$> inlinesInTags "ref"
+ TagOpen "nowiki" _ -> try $ do
+ (_,raw) <- htmlTag (~== tag)
+ if '/' `elem` raw
+ then return mempty
+ else B.text . fromEntities <$>
+ manyTill anyChar (htmlTag (~== TagClose "nowiki"))
+ TagOpen "br" _ -> B.linebreak <$ (htmlTag (~== TagOpen "br" []) -- will get /> too
+ *> optional blankline)
+ TagOpen "strike" _ -> B.strikeout <$> inlinesInTags "strike"
+ TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del"
+ TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub"
+ TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup"
+ TagOpen "math" _ -> B.math <$> charsInTags "math"
+ TagOpen "code" _ -> B.code <$> charsInTags "code"
+ TagOpen "tt" _ -> B.code <$> charsInTags "tt"
+ TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> charsInTags "hask"
+ _ -> B.rawInline "html" . snd <$> htmlTag (~== tag)
+
+special :: MWParser Inlines
+special = B.str <$> count 1 (notFollowedBy' (htmlTag isBlockTag') *>
+ oneOf specialChars)
+
+inlineHtml :: MWParser Inlines
+inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag'
+
+whitespace :: MWParser Inlines
+whitespace = B.space <$ (skipMany1 spaceChar <|> endline <|> htmlComment)
+
+endline :: MWParser ()
+endline = () <$ try (newline <*
+ notFollowedBy blankline <*
+ notFollowedBy' hrule <*
+ notFollowedBy tableStart <*
+ notFollowedBy' header <*
+ notFollowedBy anyListStart)
+
+image :: MWParser Inlines
+image = try $ do
+ sym "[["
+ sym "File:"
+ fname <- many1 (noneOf "|]")
+ _ <- many (try $ char '|' *> imageOption)
+ caption <- (B.str fname <$ sym "]]")
+ <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]")))
+ return $ B.image fname "image" caption
+
+imageOption :: MWParser String
+imageOption =
+ try (oneOfStrings [ "border", "thumbnail", "frameless"
+ , "thumb", "upright", "left", "right"
+ , "center", "none", "baseline", "sub"
+ , "super", "top", "text-top", "middle"
+ , "bottom", "text-bottom" ])
+ <|> try (string "frame")
+ <|> try (many1 (oneOf "x0123456789") <* string "px")
+ <|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]"))
+
+internalLink :: MWParser Inlines
+internalLink = try $ do
+ sym "[["
+ let addUnderscores x = let (pref,suff) = break (=='#') x
+ in pref ++ intercalate "_" (words suff)
+ pagename <- unwords . words <$> many (noneOf "|]")
+ label <- option (B.text pagename) $ char '|' *>
+ ( (mconcat <$> many1 (notFollowedBy (char ']') *> inline))
+ -- the "pipe trick"
+ -- [[Help:Contents|] -> "Contents"
+ <|> (return $ B.text $ drop 1 $ dropWhile (/=':') pagename) )
+ sym "]]"
+ linktrail <- B.text <$> many letter
+ let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
+ if "Category:" `isPrefixOf` pagename
+ then do
+ updateState $ \st -> st{ mwCategoryLinks = link : mwCategoryLinks st }
+ return mempty
+ else return link
+
+externalLink :: MWParser Inlines
+externalLink = try $ do
+ char '['
+ (_, src) <- uri
+ lab <- try (trimInlines . mconcat <$>
+ (skipMany1 spaceChar *> manyTill inline (char ']')))
+ <|> do char ']'
+ num <- mwNextLinkNumber <$> getState
+ updateState $ \st -> st{ mwNextLinkNumber = num + 1 }
+ return $ B.str $ show num
+ return $ B.link src "" lab
+
+url :: MWParser Inlines
+url = do
+ (orig, src) <- uri
+ return $ B.link src "" (B.str orig)
+
+-- | Parses a list of inlines between start and end delimiters.
+inlinesBetween :: (Show b) => MWParser a -> MWParser b -> MWParser Inlines
+inlinesBetween start end =
+ (trimInlines . mconcat) <$> try (start >> many1Till inner end)
+ where inner = innerSpace <|> (notFollowedBy' (() <$ whitespace) >> inline)
+ innerSpace = try $ whitespace >>~ notFollowedBy' end
+
+emph :: MWParser Inlines
+emph = B.emph <$> nested (inlinesBetween start end)
+ where start = sym "''" >> lookAhead nonspaceChar
+ end = try $ notFollowedBy' (() <$ strong) >> sym "''"
+
+strong :: MWParser Inlines
+strong = B.strong <$> nested (inlinesBetween start end)
+ where start = sym "'''" >> lookAhead nonspaceChar
+ end = try $ sym "'''"
+
+doubleQuotes :: MWParser Inlines
+doubleQuotes = B.doubleQuoted . trimInlines . mconcat <$> try
+ ((getState >>= guard . readerSmart . mwOptions) *>
+ openDoubleQuote *> manyTill inline closeDoubleQuote )
+ where openDoubleQuote = char '"' <* lookAhead alphaNum
+ closeDoubleQuote = char '"' <* notFollowedBy alphaNum
+
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index a26b1623d..74653efcf 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -757,7 +757,7 @@ simpleTableHeader headless = try $ do
rawContent <- if headless
then return ""
else simpleTableSep '=' >> anyLine
- dashes <- simpleDashedLines '='
+ dashes <- simpleDashedLines '=' <|> simpleDashedLines '-'
newline
let lines' = map snd dashes
let indices = scanl (+) 0 lines'
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 89f281ae8..dc95d9a56 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -47,7 +47,6 @@ Left to be implemented:
TODO : refactor common patterns across readers :
- autolink
- - smartPunctuation
- more ...
-}
@@ -62,6 +61,7 @@ import Text.Pandoc.Parsing
import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock )
import Text.HTML.TagSoup.Match
+import Data.List ( intercalate )
import Data.Char ( digitToInt, isUpper )
import Control.Monad ( guard, liftM )
import Control.Applicative ((<$>), (*>), (<*))
@@ -412,7 +412,7 @@ note = try $ do
-- | Special chars
markupChars :: [Char]
-markupChars = "\\[]*#_@~-+^|%="
+markupChars = "\\*#_@~-+^|%=[]"
-- | Break strings on following chars. Space tab and newline break for
-- inlines breaking. Open paren breaks for mark. Quote, dash and dot
@@ -427,13 +427,15 @@ wordBoundaries = markupChars ++ stringBreakers
-- | Parse a hyphened sequence of words
hyphenedWords :: Parser [Char] ParserState String
-hyphenedWords = try $ do
+hyphenedWords = intercalate "-" <$> sepBy1 wordChunk (char '-')
+
+wordChunk :: Parser [Char] ParserState String
+wordChunk = try $ do
hd <- noneOf wordBoundaries
tl <- many ( (noneOf wordBoundaries) <|>
- try (oneOf markupChars <* lookAhead (noneOf wordBoundaries) ) )
- let wd = hd:tl
- option wd $ try $
- (\r -> concat [wd, "-", r]) <$> (char '-' *> hyphenedWords)
+ try (notFollowedBy' note *> oneOf markupChars
+ <* lookAhead (noneOf wordBoundaries) ) )
+ return $ hd:tl
-- | Any string
str :: Parser [Char] ParserState Inline