{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.DokuWiki Copyright : Copyright (C) 2018-2020 Alexander Krotov License : GNU GPL, version 2 or above Maintainer : Alexander Krotov Stability : alpha Portability : portable Conversion of DokuWiki text to 'Pandoc' document. -} module Text.Pandoc.Readers.DokuWiki (readDokuWiki) where import Prelude import Control.Monad import Control.Monad.Except (throwError) import Data.Char (isAlphaNum, isDigit) import qualified Data.Foldable as F import Data.List (transpose) import Data.Maybe (fromMaybe, catMaybes) import Data.Text (Text) import qualified Data.Text as T import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad (..)) import Text.Pandoc.Definition import Text.Pandoc.Error (PandocError (PandocParsecError)) import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, nested) import Text.Pandoc.Shared (crFilter, trim, underlineSpan, stringify, tshow) -- | Read DokuWiki from an input string and return a Pandoc document. readDokuWiki :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readDokuWiki opts s = do let input = crFilter s res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input case res of Left e -> throwError $ PandocParsecError input e Right d -> return d type DWParser = ParserT Text ParserState -- * Utility functions -- | Parse end-of-line, which can be either a newline or end-of-file. eol :: Stream s m Char => ParserT s st m () eol = void newline <|> eof nested :: PandocMonad m => DWParser m a -> DWParser m a nested p = do nestlevel <- stateMaxNestingLevel <$> getState guard $ nestlevel > 0 updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } res <- p updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res guardColumnOne :: PandocMonad m => DWParser m () guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1) -- | Parse DokuWiki document. parseDokuWiki :: PandocMonad m => DWParser m Pandoc parseDokuWiki = B.doc . mconcat <$> many block <* spaces <* eof -- | Parse and attributes codeLanguage :: PandocMonad m => DWParser m (Text, [Text], [(Text, Text)]) codeLanguage = try $ do rawLang <- option "-" (spaceChar *> manyTillChar anyChar (lookAhead (spaceChar <|> char '>'))) let attr = case rawLang of "-" -> [] l -> [l] return ("", attr, []) -- | Generic parser for and tags codeTag :: PandocMonad m => ((Text, [Text], [(Text, Text)]) -> Text -> a) -> Text -> DWParser m a codeTag f tag = try $ f <$ char '<' <* textStr tag <*> codeLanguage <* manyTill anyChar (char '>') <* optional (manyTill spaceChar eol) <*> manyTillChar anyChar (try $ string "') -- * Inline parsers -- | Parse any inline element but softbreak. inline' :: PandocMonad m => DWParser m B.Inlines inline' = whitespace <|> br <|> bold <|> italic <|> underlined <|> nowiki <|> percent <|> link <|> image <|> monospaced <|> subscript <|> superscript <|> deleted <|> footnote <|> inlineCode <|> inlineFile <|> inlineHtml <|> inlinePhp <|> autoLink <|> autoEmail <|> notoc <|> nocache <|> str <|> symbol "inline" -- | Parse any inline element, including soft break. inline :: PandocMonad m => DWParser m B.Inlines inline = endline <|> inline' endline :: PandocMonad m => DWParser m B.Inlines endline = try $ B.softbreak <$ skipMany spaceChar <* linebreak whitespace :: PandocMonad m => DWParser m B.Inlines whitespace = try $ B.space <$ skipMany1 spaceChar br :: PandocMonad m => DWParser m B.Inlines br = try $ B.linebreak <$ string "\\\\" <* space linebreak :: PandocMonad m => DWParser m B.Inlines linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) where lastNewline = mempty <$ eof innerNewline = pure B.space between :: (Monoid c, PandocMonad m, Show b) => DWParser m a -> DWParser m b -> (DWParser m b -> DWParser m c) -> DWParser m c between start end p = mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) enclosed :: (Monoid b, PandocMonad m, Show a) => DWParser m a -> (DWParser m a -> DWParser m b) -> DWParser m b enclosed sep p = between sep (try sep) p nestedInlines :: (Show a, PandocMonad m) => DWParser m a -> DWParser m B.Inlines nestedInlines end = innerSpace <|> nestedInline where innerSpace = try $ whitespace <* notFollowedBy end nestedInline = notFollowedBy whitespace >> nested inline bold :: PandocMonad m => DWParser m B.Inlines bold = try $ B.strong <$> enclosed (string "**") nestedInlines italic :: PandocMonad m => DWParser m B.Inlines italic = try $ B.emph <$> enclosed (string "//") nestedInlines underlined :: PandocMonad m => DWParser m B.Inlines underlined = try $ underlineSpan <$> enclosed (string "__") nestedInlines nowiki :: PandocMonad m => DWParser m B.Inlines nowiki = try $ B.text <$ string "" <*> manyTillChar anyChar (try $ string "") percent :: PandocMonad m => DWParser m B.Inlines percent = try $ B.text <$> enclosed (string "%%") nestedText nestedText :: (Show a, PandocMonad m) => DWParser m a -> DWParser m Text nestedText end = innerSpace <|> countChar 1 nonspaceChar where innerSpace = try $ many1Char spaceChar <* notFollowedBy end monospaced :: PandocMonad m => DWParser m B.Inlines monospaced = try $ B.code . (T.concat . map stringify . B.toList) <$> enclosed (string "''") nestedInlines subscript :: PandocMonad m => DWParser m B.Inlines subscript = try $ B.subscript <$> between (string "") (try $ string "") nestedInlines superscript :: PandocMonad m => DWParser m B.Inlines superscript = try $ B.superscript <$> between (string "") (try $ string "") nestedInlines deleted :: PandocMonad m => DWParser m B.Inlines deleted = try $ B.strikeout <$> between (string "") (try $ string "") nestedInlines -- | Parse a footnote. footnote :: PandocMonad m => DWParser m B.Inlines footnote = try $ B.note . B.para <$> between (string "((") (try $ string "))") nestedInlines inlineCode :: PandocMonad m => DWParser m B.Inlines inlineCode = codeTag B.codeWith "code" inlineFile :: PandocMonad m => DWParser m B.Inlines inlineFile = codeTag B.codeWith "file" inlineHtml :: PandocMonad m => DWParser m B.Inlines inlineHtml = try $ B.rawInline "html" <$ string "" <*> manyTillChar anyChar (try $ string "") inlinePhp :: PandocMonad m => DWParser m B.Inlines inlinePhp = try $ B.codeWith ("", ["php"], []) <$ string "" <*> manyTillChar anyChar (try $ string "") makeLink :: (Text, Text) -> B.Inlines makeLink (text, url) = B.link url "" $ B.str text autoEmail :: PandocMonad m => DWParser m B.Inlines autoEmail = try $ do state <- getState guard $ stateAllowLinks state makeLink <$ char '<' <*> emailAddress <* char '>' autoLink :: PandocMonad m => DWParser m B.Inlines autoLink = try $ do state <- getState guard $ stateAllowLinks state (text, url) <- uri guard $ checkLink (T.last url) return $ makeLink (text, url) where checkLink c | c == '/' = True | otherwise = isAlphaNum c notoc :: PandocMonad m => DWParser m B.Inlines notoc = try $ mempty <$ string "~~NOTOC~~" nocache :: PandocMonad m => DWParser m B.Inlines nocache = try $ mempty <$ string "~~NOCACHE~~" str :: PandocMonad m => DWParser m B.Inlines str = B.str <$> (many1Char alphaNum <|> countChar 1 characterReference) symbol :: PandocMonad m => DWParser m B.Inlines symbol = B.str <$> countChar 1 nonspaceChar link :: PandocMonad m => DWParser m B.Inlines link = try $ do st <- getState guard $ stateAllowLinks st setState $ st{ stateAllowLinks = False } l <- linkText setState $ st{ stateAllowLinks = True } return l isExternalLink :: Text -> Bool isExternalLink s = "://" `T.isPrefixOf` sSuff where sSuff = T.dropWhile (\c -> isAlphaNum c || (c `elem` ['-', '.', '+'])) s isAbsolutePath :: Text -> Bool isAbsolutePath (T.uncons -> Just ('.', _)) = False isAbsolutePath s = T.any (== ':') s normalizeDots :: Text -> Text normalizeDots path | not (T.null pref) = case T.uncons suff of Just (':', _) -> path _ -> pref <> ":" <> suff | otherwise = path where (pref, suff) = T.span (== '.') path normalizeInternalPath :: Text -> Text normalizeInternalPath path = if isAbsolutePath path then ensureAbsolute normalizedPath else normalizedPath where normalizedPath = T.intercalate "/" $ dropWhile (== ".") $ T.splitOn ":" $ normalizeDots path ensureAbsolute s@(T.uncons -> Just ('/', _)) = s ensureAbsolute s = "/" <> s normalizePath :: Text -> Text normalizePath path = if isExternalLink path then path else normalizeInternalPath path urlToText :: Text -> Text urlToText url = if isExternalLink url then url else T.takeWhileEnd (/= ':') url -- Parse link or image parseLink :: PandocMonad m => (Text -> Maybe B.Inlines -> B.Inlines) -> Text -> Text -> DWParser m B.Inlines parseLink f l r = f <$ textStr l <*> many1TillChar anyChar (lookAhead (void (char '|') <|> try (void $ textStr r))) <*> optionMaybe (B.trimInlines . mconcat <$> (char '|' *> manyTill inline (try $ lookAhead $ textStr r))) <* textStr r -- | Split Interwiki link into left and right part -- | Return Nothing if it is not Interwiki link splitInterwiki :: Text -> Maybe (Text, Text) splitInterwiki path = case T.span (\c -> isAlphaNum c || c == '.') path of (l, T.uncons -> Just ('>', r)) -> Just (l, r) _ -> Nothing interwikiToUrl :: Text -> Text -> Text interwikiToUrl "callto" page = "callto://" <> page interwikiToUrl "doku" page = "https://www.dokuwiki.org/" <> page interwikiToUrl "phpfn" page = "https://secure.php.net/" <> page interwikiToUrl "tel" page = "tel:" <> page interwikiToUrl "wp" page = "https://en.wikipedia.org/wiki/" <> page interwikiToUrl "wpde" page = "https://de.wikipedia.org/wiki/" <> page interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" <> page interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" <> page interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" <> page interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" <> page interwikiToUrl _ page = "https://www.google.com/search?q=" <> page <> "&btnI=lucky" linkText :: PandocMonad m => DWParser m B.Inlines linkText = parseLink fromRaw "[[" "]]" where fromRaw path description = B.link normalizedPath "" (fromMaybe (B.str defaultDescription) description) where path' = trim path interwiki = splitInterwiki path' normalizedPath = case interwiki of Nothing -> normalizePath path' Just (l, r) -> interwikiToUrl l r defaultDescription = case interwiki of Nothing -> urlToText path' Just (_, r) -> r -- Matches strings like "100x100" (width x height) and "50" (width) isWidthHeightParameter :: Text -> Bool isWidthHeightParameter s = case T.uncons s of Just (x, xs) -> isDigit x && case T.uncons $ T.dropWhile isDigit xs of Just ('x', ys) | not (T.null ys) -> T.all isDigit ys Nothing -> True _ -> False _ -> False parseWidthHeight :: Text -> (Maybe Text, Maybe Text) parseWidthHeight s = (width, height) where width = Just $ T.takeWhile isDigit s height = case T.uncons $ T.dropWhile isDigit s of Just ('x', xs) -> Just xs _ -> Nothing image :: PandocMonad m => DWParser m B.Inlines image = try $ parseLink fromRaw "{{" "}}" where fromRaw path description = if linkOnly then B.link normalizedPath "" (fromMaybe defaultDescription description) else B.imageWith ("", classes, attributes) normalizedPath "" (fromMaybe defaultDescription description) where (path', parameters) = T.span (/= '?') $ trim path normalizedPath = normalizePath path' leftPadding = " " `T.isPrefixOf` path rightPadding = " " `T.isSuffixOf` path classes = case (leftPadding, rightPadding) of (False, False) -> [] (False, True) -> ["align-left"] (True, False) -> ["align-right"] (True, True) -> ["align-center"] parameterList = T.splitOn "&" $ T.drop 1 parameters linkOnly = "linkonly" `elem` parameterList (width, height) = maybe (Nothing, Nothing) parseWidthHeight (F.find isWidthHeightParameter parameterList) attributes = catMaybes [fmap ("width",) width, fmap ("height",) height] defaultDescription = B.str $ urlToText path' -- * Block parsers block :: PandocMonad m => DWParser m B.Blocks block = do res <- mempty <$ skipMany1 blankline <|> blockElements <|> para skipMany blankline trace (T.take 60 $ tshow $ B.toList res) return res blockElements :: PandocMonad m => DWParser m B.Blocks blockElements = horizontalLine <|> header <|> list " " <|> indentedCode <|> quote <|> blockCode <|> blockFile <|> blockHtml <|> blockPhp <|> table horizontalLine :: PandocMonad m => DWParser m B.Blocks horizontalLine = try $ B.horizontalRule <$ string "---" <* many1 (char '-') <* eol header :: PandocMonad m => DWParser m B.Blocks header = try $ do guardColumnOne eqs <- many1 (char '=') let lev = length eqs guard $ lev < 7 contents <- B.trimInlines . mconcat <$> manyTill inline (try $ char '=' *> many1 (char '=')) attr <- registerHeader nullAttr contents return $ B.headerWith attr (7 - lev) contents list :: PandocMonad m => Text -> DWParser m B.Blocks list prefix = bulletList prefix <|> orderedList prefix bulletList :: PandocMonad m => Text -> DWParser m B.Blocks bulletList prefix = try $ B.bulletList <$> parseList prefix '*' orderedList :: PandocMonad m => Text -> DWParser m B.Blocks orderedList prefix = try $ B.orderedList <$> parseList prefix '-' parseList :: PandocMonad m => Text -> Char -> DWParser m [B.Blocks] parseList prefix marker = many1 ((<>) <$> item <*> fmap mconcat (many continuation)) where continuation = try $ list (" " <> prefix) item = try $ textStr prefix *> char marker *> char ' ' *> itemContents itemContents = B.plain . mconcat <$> many1Till inline' eol indentedCode :: PandocMonad m => DWParser m B.Blocks indentedCode = try $ B.codeBlock . T.unlines <$> many1 indentedLine where indentedLine = try $ string " " *> manyTillChar anyChar eol quote :: PandocMonad m => DWParser m B.Blocks quote = try $ nestedQuote 0 where prefix level = count level (char '>') contents level = nestedQuote level <|> quoteLine quoteLine = try $ B.plain . B.trimInlines . mconcat <$> many1Till inline' eol quoteContents level = (<>) <$> contents level <*> quoteContinuation level quoteContinuation level = mconcat <$> many (try $ prefix level *> contents level) nestedQuote level = B.blockQuote <$ char '>' <*> quoteContents (level + 1 :: Int) blockHtml :: PandocMonad m => DWParser m B.Blocks blockHtml = try $ B.rawBlock "html" <$ string "" <* optional (manyTill spaceChar eol) <*> manyTillChar anyChar (try $ string "") blockPhp :: PandocMonad m => DWParser m B.Blocks blockPhp = try $ B.codeBlockWith ("", ["php"], []) <$ string "" <* optional (manyTill spaceChar eol) <*> manyTillChar anyChar (try $ string "") table :: PandocMonad m => DWParser m B.Blocks table = do firstSeparator <- lookAhead tableCellSeparator rows <- tableRows let (headerRow, body) = if firstSeparator == '^' then (head rows, tail rows) else ([], rows) let attrs = (AlignDefault, 0.0) <$ transpose rows pure $ B.table mempty attrs headerRow body tableRows :: PandocMonad m => DWParser m [[B.Blocks]] tableRows = many1 tableRow tableRow :: PandocMonad m => DWParser m [B.Blocks] tableRow = many1Till tableCell tableRowEnd tableRowEnd :: PandocMonad m => DWParser m Char tableRowEnd = try $ tableCellSeparator <* manyTill spaceChar eol tableCellSeparator :: PandocMonad m => DWParser m Char tableCellSeparator = char '|' <|> char '^' tableCell :: PandocMonad m => DWParser m B.Blocks tableCell = try $ B.plain . B.trimInlines . mconcat <$> (normalCell <|> headerCell) where normalCell = char '|' *> manyTill inline' (lookAhead tableCellSeparator) headerCell = char '^' *> manyTill inline' (lookAhead tableCellSeparator) blockCode :: PandocMonad m => DWParser m B.Blocks blockCode = codeTag B.codeBlockWith "code" blockFile :: PandocMonad m => DWParser m B.Blocks blockFile = codeTag B.codeBlockWith "file" para :: PandocMonad m => DWParser m B.Blocks para = result . mconcat <$> many1Till inline endOfParaElement where endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement endOfInput = try $ skipMany blankline >> skipSpaces >> eof endOfPara = try $ blankline >> skipMany1 blankline newBlockElement = try $ blankline >> void blockElements result content = if F.all (==Space) content then mempty else B.para $ B.trimInlines content