aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorAlexander <ilabdsf@gmail.com>2019-01-06 23:06:32 +0000
committerJohn MacFarlane <jgm@berkeley.edu>2019-01-06 15:06:32 -0800
commit40c30a9d88101e2a00b904597c6ad4d8f8bce60b (patch)
treee18e2a5474c0c97b5ca8b40915fa92ac1beae985 /src/Text/Pandoc
parenta5f5002eefd43820b1f54191b0cf3cbd743eca0a (diff)
downloadpandoc-40c30a9d88101e2a00b904597c6ad4d8f8bce60b.tar.gz
Add DokuWiki reader (#5108)
Closes #1792
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/DokuWiki.hs529
2 files changed, 532 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 630b8b858..434725887 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Readers
, readMarkdown
, readCommonMark
, readCreole
+ , readDokuWiki
, readMediaWiki
, readVimwiki
, readRST
@@ -86,6 +87,7 @@ import Text.Pandoc.Readers.CommonMark
import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
+import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
import Text.Pandoc.Readers.Haddock
@@ -123,6 +125,7 @@ readers = [ ("native" , TextReader readNative)
,("markdown_mmd", TextReader readMarkdown)
,("commonmark" , TextReader readCommonMark)
,("creole" , TextReader readCreole)
+ ,("dokuwiki" , TextReader readDokuWiki)
,("gfm" , TextReader readCommonMark)
,("rst" , TextReader readRST)
,("mediawiki" , TextReader readMediaWiki)
diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs
new file mode 100644
index 000000000..b9ef96675
--- /dev/null
+++ b/src/Text/Pandoc/Readers/DokuWiki.hs
@@ -0,0 +1,529 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections #-}
+
+{-
+ Copyright (C) 2018-2019 Alexander Krotov <ilabdsf@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.DokuWiki
+ Copyright : Copyright (C) 2018 Alexander Krotov
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Alexander Krotov <ilabdsf@gmail.com>
+ Stability : alpha
+ Portability : portable
+
+Conversion of DokuWiki text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.DokuWiki (readDokuWiki) where
+
+import Prelude
+import Control.Monad
+import Control.Monad.Except (throwError)
+import Data.Char (isAlphaNum, isDigit)
+import qualified Data.Foldable as F
+import Data.List (intercalate, transpose, isPrefixOf, isSuffixOf)
+import Data.List.Split (splitOn)
+import Data.Maybe (fromMaybe, catMaybes)
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Class (PandocMonad (..))
+import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError (PandocParsecError))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing hiding (enclosed, nested)
+import Text.Pandoc.Shared (crFilter, trim, underlineSpan)
+
+-- | Read DokuWiki from an input string and return a Pandoc document.
+readDokuWiki :: PandocMonad m
+ => ReaderOptions
+ -> Text
+ -> m Pandoc
+readDokuWiki opts s = do
+ let input = crFilter s
+ res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input
+ case res of
+ Left e -> throwError $ PandocParsecError (T.unpack input) e
+ Right d -> return d
+
+type DWParser = ParserT Text ParserState
+
+-- * Utility functions
+
+-- | Parse end-of-line, which can be either a newline or end-of-file.
+eol :: Stream s m Char => ParserT s st m ()
+eol = void newline <|> eof
+
+nested :: PandocMonad m => DWParser m a -> DWParser m a
+nested p = do
+ nestlevel <- stateMaxNestingLevel <$> getState
+ guard $ nestlevel > 0
+ updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 }
+ res <- p
+ updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
+ return res
+
+guardColumnOne :: PandocMonad m => DWParser m ()
+guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1)
+
+-- | Parse DokuWiki document.
+parseDokuWiki :: PandocMonad m => DWParser m Pandoc
+parseDokuWiki =
+ B.doc . mconcat <$> many block <* spaces <* eof
+
+-- | Parse <code> and <file> attributes
+codeLanguage :: PandocMonad m => DWParser m (String, [String], [(String, String)])
+codeLanguage = try $ do
+ rawLang <- option "-" (spaceChar *> manyTill anyChar (lookAhead (spaceChar <|> char '>')))
+ let attr = case rawLang of
+ "-" -> []
+ l -> [l]
+ return ("", attr, [])
+
+-- | Generic parser for <code> and <file> tags
+codeTag :: PandocMonad m
+ => ((String, [String], [(String, String)]) -> String -> a)
+ -> String
+ -> DWParser m a
+codeTag f tag = try $ f
+ <$ char '<'
+ <* string tag
+ <*> codeLanguage
+ <* manyTill anyChar (char '>')
+ <* optional (manyTill spaceChar eol)
+ <*> manyTill anyChar (try $ string "</" <* string tag <* char '>')
+
+-- * Inline parsers
+
+-- | Parse any inline element but softbreak.
+inline' :: PandocMonad m => DWParser m B.Inlines
+inline' = whitespace
+ <|> br
+ <|> bold
+ <|> italic
+ <|> underlined
+ <|> nowiki
+ <|> percent
+ <|> link
+ <|> image
+ <|> monospaced
+ <|> subscript
+ <|> superscript
+ <|> deleted
+ <|> footnote
+ <|> inlineCode
+ <|> inlineFile
+ <|> inlineHtml
+ <|> inlinePhp
+ <|> autoLink
+ <|> autoEmail
+ <|> notoc
+ <|> nocache
+ <|> str
+ <|> symbol
+ <?> "inline"
+
+-- | Parse any inline element, including soft break.
+inline :: PandocMonad m => DWParser m B.Inlines
+inline = endline <|> inline'
+
+endline :: PandocMonad m => DWParser m B.Inlines
+endline = try $ B.softbreak <$ skipMany spaceChar <* linebreak
+
+whitespace :: PandocMonad m => DWParser m B.Inlines
+whitespace = try $ B.space <$ skipMany1 spaceChar
+
+br :: PandocMonad m => DWParser m B.Inlines
+br = try $ B.linebreak <$ string "\\\\" <* space
+
+linebreak :: PandocMonad m => DWParser m B.Inlines
+linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline)
+ where lastNewline = mempty <$ eof
+ innerNewline = pure B.space
+
+between :: (Monoid c, PandocMonad m, Show b)
+ => DWParser m a -> DWParser m b -> (DWParser m b -> DWParser m c)
+ -> DWParser m c
+between start end p =
+ mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end)
+
+enclosed :: (Monoid b, PandocMonad m, Show a)
+ => DWParser m a -> (DWParser m a -> DWParser m b) -> DWParser m b
+enclosed sep p = between sep (try sep) p
+
+nestedInlines :: (Show a, PandocMonad m)
+ => DWParser m a -> DWParser m B.Inlines
+nestedInlines end = innerSpace <|> nestedInline
+ where
+ innerSpace = try $ whitespace <* notFollowedBy end
+ nestedInline = notFollowedBy whitespace >> nested inline
+
+bold :: PandocMonad m => DWParser m B.Inlines
+bold = try $ B.strong <$> enclosed (string "**") nestedInlines
+
+italic :: PandocMonad m => DWParser m B.Inlines
+italic = try $ B.emph <$> enclosed (string "//") nestedInlines
+
+underlined :: PandocMonad m => DWParser m B.Inlines
+underlined = try $ underlineSpan <$> enclosed (string "__") nestedInlines
+
+nowiki :: PandocMonad m => DWParser m B.Inlines
+nowiki = try $ B.text <$ string "<nowiki>" <*> manyTill anyChar (try $ string "</nowiki>")
+
+percent :: PandocMonad m => DWParser m B.Inlines
+percent = try $ B.text <$> enclosed (string "%%") nestedString
+
+nestedString :: (Show a, PandocMonad m)
+ => DWParser m a -> DWParser m String
+nestedString end = innerSpace <|> count 1 nonspaceChar
+ where
+ innerSpace = try $ many1 spaceChar <* notFollowedBy end
+
+monospaced :: PandocMonad m => DWParser m B.Inlines
+monospaced = try $ B.code <$> enclosed (string "''") nestedString
+
+subscript :: PandocMonad m => DWParser m B.Inlines
+subscript = try $ B.subscript <$> between (string "<sub>") (try $ string "</sub>") nestedInlines
+
+superscript :: PandocMonad m => DWParser m B.Inlines
+superscript = try $ B.superscript <$> between (string "<sup>") (try $ string "</sup>") nestedInlines
+
+deleted :: PandocMonad m => DWParser m B.Inlines
+deleted = try $ B.strikeout <$> between (string "<del>") (try $ string "</del>") nestedInlines
+
+-- | Parse a footnote.
+footnote :: PandocMonad m => DWParser m B.Inlines
+footnote = try $ B.note . B.para <$> between (string "((") (try $ string "))") nestedInlines
+
+inlineCode :: PandocMonad m => DWParser m B.Inlines
+inlineCode = codeTag B.codeWith "code"
+
+inlineFile :: PandocMonad m => DWParser m B.Inlines
+inlineFile = codeTag B.codeWith "file"
+
+inlineHtml :: PandocMonad m => DWParser m B.Inlines
+inlineHtml = try $ B.rawInline "html" <$ string "<html>" <*> manyTill anyChar (try $ string "</html>")
+
+inlinePhp :: PandocMonad m => DWParser m B.Inlines
+inlinePhp = try $ B.codeWith ("", ["php"], []) <$ string "<php>" <*> manyTill anyChar (try $ string "</php>")
+
+makeLink :: (String, String) -> B.Inlines
+makeLink (text, url) = B.link url "" $ B.str text
+
+autoEmail :: PandocMonad m => DWParser m B.Inlines
+autoEmail = try $ do
+ state <- getState
+ guard $ stateAllowLinks state
+ makeLink <$ char '<' <*> emailAddress <* char '>'
+
+autoLink :: PandocMonad m => DWParser m B.Inlines
+autoLink = try $ do
+ state <- getState
+ guard $ stateAllowLinks state
+ (text, url) <- uri
+ guard $ checkLink (last url)
+ return $ makeLink (text, url)
+ where
+ checkLink c
+ | c == '/' = True
+ | otherwise = isAlphaNum c
+
+notoc :: PandocMonad m => DWParser m B.Inlines
+notoc = try $ mempty <$ string "~~NOTOC~~"
+
+nocache :: PandocMonad m => DWParser m B.Inlines
+nocache = try $ mempty <$ string "~~NOCACHE~~"
+
+str :: PandocMonad m => DWParser m B.Inlines
+str = B.str <$> (many1 alphaNum <|> count 1 characterReference)
+
+symbol :: PandocMonad m => DWParser m B.Inlines
+symbol = B.str <$> count 1 nonspaceChar
+
+link :: PandocMonad m => DWParser m B.Inlines
+link = try $ do
+ st <- getState
+ guard $ stateAllowLinks st
+ setState $ st{ stateAllowLinks = False }
+ l <- linkText
+ setState $ st{ stateAllowLinks = True }
+ return l
+
+isExternalLink :: String -> Bool
+isExternalLink s =
+ case dropWhile (\c -> isAlphaNum c || (c `elem` ['-', '.', '+'])) s of
+ (':':'/':'/':_) -> True
+ _ -> False
+
+isAbsolutePath :: String -> Bool
+isAbsolutePath ('.':_) = False
+isAbsolutePath s = ':' `elem` s
+
+normalizeDots :: String -> String
+normalizeDots path@('.':_) =
+ case dropWhile (== '.') path of
+ ':':_ -> path
+ _ -> takeWhile (== '.') path ++ ':':dropWhile (== '.') path
+normalizeDots path = path
+
+normalizeInternalPath :: String -> String
+normalizeInternalPath path =
+ if isAbsolutePath path
+ then ensureAbsolute normalizedPath
+ else normalizedPath
+ where
+ normalizedPath = intercalate "/" $ dropWhile (== ".") $ splitOn ":" $ normalizeDots path
+ ensureAbsolute s@('/':_) = s
+ ensureAbsolute s = '/':s
+
+normalizePath :: String -> String
+normalizePath path =
+ if isExternalLink path
+ then path
+ else normalizeInternalPath path
+
+urlToText :: String -> String
+urlToText url =
+ if isExternalLink url
+ then url
+ else reverse $ takeWhile (/= ':') $ reverse url
+
+-- Parse link or image
+parseLink :: PandocMonad m
+ => (String -> Maybe B.Inlines -> B.Inlines)
+ -> String
+ -> String
+ -> DWParser m B.Inlines
+parseLink f l r = f
+ <$ string l
+ <*> many1Till anyChar (lookAhead (void (char '|') <|> try (void $ string r)))
+ <*> optionMaybe (B.trimInlines . mconcat <$> (char '|' *> manyTill inline (try $ lookAhead $ string r)))
+ <* string r
+
+-- | Split Interwiki link into left and right part
+-- | Return Nothing if it is not Interwiki link
+splitInterwiki :: String -> Maybe (String, String)
+splitInterwiki path =
+ case span (\c -> isAlphaNum c || c == '.') path of
+ (l, '>':r) -> Just (l, r)
+ _ -> Nothing
+
+interwikiToUrl :: String -> String -> String
+interwikiToUrl "callto" page = "callto://" ++ page
+interwikiToUrl "doku" page = "https://www.dokuwiki.org/" ++ page
+interwikiToUrl "phpfn" page = "https://secure.php.net/" ++ page
+interwikiToUrl "tel" page = "tel:" ++ page
+interwikiToUrl "wp" page = "https://en.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpde" page = "https://de.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" ++ page
+interwikiToUrl _ page = "https://www.google.com/search?q=" ++ page ++ "&btnI=lucky"
+
+linkText :: PandocMonad m => DWParser m B.Inlines
+linkText = parseLink fromRaw "[[" "]]"
+ where
+ fromRaw path description =
+ B.link normalizedPath "" (fromMaybe (B.str defaultDescription) description)
+ where
+ path' = trim path
+ interwiki = splitInterwiki path'
+ normalizedPath =
+ case interwiki of
+ Nothing -> normalizePath path'
+ Just (l, r) -> interwikiToUrl l r
+ defaultDescription =
+ case interwiki of
+ Nothing -> urlToText path'
+ Just (_, r) -> r
+
+-- Matches strings like "100x100" (width x height) and "50" (width)
+isWidthHeightParameter :: String -> Bool
+isWidthHeightParameter s =
+ case s of
+ (x:xs) ->
+ isDigit x && case dropWhile isDigit xs of
+ ('x':ys@(_:_)) -> all isDigit ys
+ "" -> True
+ _ -> False
+ _ -> False
+
+parseWidthHeight :: String -> (Maybe String, Maybe String)
+parseWidthHeight s = (width, height)
+ where
+ width = Just $ takeWhile isDigit s
+ height =
+ case dropWhile isDigit s of
+ ('x':xs) -> Just xs
+ _ -> Nothing
+
+image :: PandocMonad m => DWParser m B.Inlines
+image = parseLink fromRaw "{{" "}}"
+ where
+ fromRaw path description =
+ if linkOnly
+ then B.link normalizedPath "" (fromMaybe defaultDescription description)
+ else B.imageWith ("", classes, attributes) normalizedPath "" (fromMaybe defaultDescription description)
+ where
+ (path', parameters) = span (/= '?') $ trim path
+ normalizedPath = normalizePath path'
+ leftPadding = " " `isPrefixOf` path
+ rightPadding = " " `isSuffixOf` path
+ classes =
+ case (leftPadding, rightPadding) of
+ (False, False) -> []
+ (False, True) -> ["align-left"]
+ (True, False) -> ["align-right"]
+ (True, True) -> ["align-center"]
+ parameterList = splitOn "&" $ drop 1 parameters
+ linkOnly = "linkonly" `elem` parameterList
+ (width, height) = maybe (Nothing, Nothing) parseWidthHeight (F.find isWidthHeightParameter parameterList)
+ attributes = catMaybes [fmap ("width",) width, fmap ("height",) height]
+ defaultDescription = B.str $ urlToText path'
+
+-- * Block parsers
+
+block :: PandocMonad m => DWParser m B.Blocks
+block = do
+ res <- mempty <$ skipMany1 blankline
+ <|> blockElements
+ <|> para
+ skipMany blankline
+ trace (take 60 $ show $ B.toList res)
+ return res
+
+blockElements :: PandocMonad m => DWParser m B.Blocks
+blockElements = horizontalLine
+ <|> header
+ <|> list " "
+ <|> indentedCode
+ <|> quote
+ <|> blockCode
+ <|> blockFile
+ <|> blockHtml
+ <|> blockPhp
+ <|> table
+
+horizontalLine :: PandocMonad m => DWParser m B.Blocks
+horizontalLine = try $ B.horizontalRule <$ string "---" <* many1 (char '-') <* eol
+
+header :: PandocMonad m => DWParser m B.Blocks
+header = try $ do
+ guardColumnOne
+ eqs <- many1 (char '=')
+ let lev = length eqs
+ guard $ lev < 7
+ contents <- B.trimInlines . mconcat <$> manyTill inline (try $ char '=' *> many1 (char '='))
+ attr <- registerHeader nullAttr contents
+ return $ B.headerWith attr (7 - lev) contents
+
+list :: PandocMonad m => String -> DWParser m B.Blocks
+list prefix = bulletList prefix <|> orderedList prefix
+
+bulletList :: PandocMonad m => String -> DWParser m B.Blocks
+bulletList prefix = try $ B.bulletList <$> parseList prefix '*'
+
+orderedList :: PandocMonad m => String -> DWParser m B.Blocks
+orderedList prefix = try $ B.orderedList <$> parseList prefix '-'
+
+parseList :: PandocMonad m
+ => String
+ -> Char
+ -> DWParser m [B.Blocks]
+parseList prefix marker =
+ many1 ((<>) <$> item <*> fmap mconcat (many continuation))
+ where
+ continuation = try $ list (" " ++ prefix)
+ item = try $ string prefix *> char marker *> char ' ' *> itemContents
+ itemContents = B.plain . mconcat <$> many1Till inline' eol
+
+indentedCode :: PandocMonad m => DWParser m B.Blocks
+indentedCode = try $ B.codeBlock . unlines <$> many1 indentedLine
+ where
+ indentedLine = try $ string " " *> manyTill anyChar eol
+
+quote :: PandocMonad m => DWParser m B.Blocks
+quote = try $ nestedQuote 0
+ where
+ prefix level = count level (char '>')
+ contents level = nestedQuote level <|> quoteLine
+ quoteLine = try $ B.plain . B.trimInlines . mconcat <$> many1Till inline' eol
+ quoteContents level = (<>) <$> contents level <*> quoteContinuation level
+ quoteContinuation level = mconcat <$> many (try $ prefix level *> contents level)
+ nestedQuote level = B.blockQuote <$ char '>' <*> quoteContents (level + 1 :: Int)
+
+blockHtml :: PandocMonad m => DWParser m B.Blocks
+blockHtml = try $ B.rawBlock "html"
+ <$ string "<HTML>"
+ <* optional (manyTill spaceChar eol)
+ <*> manyTill anyChar (try $ string "</HTML>")
+
+blockPhp :: PandocMonad m => DWParser m B.Blocks
+blockPhp = try $ B.codeBlockWith ("", ["php"], [])
+ <$ string "<PHP>"
+ <* optional (manyTill spaceChar eol)
+ <*> manyTill anyChar (try $ string "</PHP>")
+
+table :: PandocMonad m => DWParser m B.Blocks
+table = do
+ firstSeparator <- lookAhead tableCellSeparator
+ rows <- tableRows
+ let (headerRow, body) = if firstSeparator == '^'
+ then (head rows, tail rows)
+ else ([], rows)
+ let attrs = const (AlignDefault, 0.0) <$> transpose rows
+ pure $ B.table mempty attrs headerRow body
+
+tableRows :: PandocMonad m => DWParser m [[B.Blocks]]
+tableRows = many1 tableRow
+
+tableRow :: PandocMonad m => DWParser m [B.Blocks]
+tableRow = many1Till tableCell tableRowEnd
+
+tableRowEnd :: PandocMonad m => DWParser m Char
+tableRowEnd = try $ tableCellSeparator <* manyTill spaceChar eol
+
+tableCellSeparator :: PandocMonad m => DWParser m Char
+tableCellSeparator = char '|' <|> char '^'
+
+tableCell :: PandocMonad m => DWParser m B.Blocks
+tableCell = try $ B.plain . B.trimInlines . mconcat <$> (normalCell <|> headerCell)
+ where
+ normalCell = char '|' *> manyTill inline' (lookAhead tableCellSeparator)
+ headerCell = char '^' *> manyTill inline' (lookAhead tableCellSeparator)
+
+blockCode :: PandocMonad m => DWParser m B.Blocks
+blockCode = codeTag B.codeBlockWith "code"
+
+blockFile :: PandocMonad m => DWParser m B.Blocks
+blockFile = codeTag B.codeBlockWith "file"
+
+para :: PandocMonad m => DWParser m B.Blocks
+para = result . mconcat <$> many1Till inline endOfParaElement
+ where
+ endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement
+ endOfInput = try $ skipMany blankline >> skipSpaces >> eof
+ endOfPara = try $ blankline >> skipMany1 blankline
+ newBlockElement = try $ blankline >> void blockElements
+ result content = if F.all (==Space) content
+ then mempty
+ else B.para $ B.trimInlines content