diff options
author | Alexander <ilabdsf@gmail.com> | 2019-01-06 23:06:32 +0000 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2019-01-06 15:06:32 -0800 |
commit | 40c30a9d88101e2a00b904597c6ad4d8f8bce60b (patch) | |
tree | e18e2a5474c0c97b5ca8b40915fa92ac1beae985 /src | |
parent | a5f5002eefd43820b1f54191b0cf3cbd743eca0a (diff) | |
download | pandoc-40c30a9d88101e2a00b904597c6ad4d8f8bce60b.tar.gz |
Add DokuWiki reader (#5108)
Closes #1792
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/DokuWiki.hs | 529 |
2 files changed, 532 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 630b8b858..434725887 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -47,6 +47,7 @@ module Text.Pandoc.Readers , readMarkdown , readCommonMark , readCreole + , readDokuWiki , readMediaWiki , readVimwiki , readRST @@ -86,6 +87,7 @@ import Text.Pandoc.Readers.CommonMark import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx +import Text.Pandoc.Readers.DokuWiki import Text.Pandoc.Readers.EPUB import Text.Pandoc.Readers.FB2 import Text.Pandoc.Readers.Haddock @@ -123,6 +125,7 @@ readers = [ ("native" , TextReader readNative) ,("markdown_mmd", TextReader readMarkdown) ,("commonmark" , TextReader readCommonMark) ,("creole" , TextReader readCreole) + ,("dokuwiki" , TextReader readDokuWiki) ,("gfm" , TextReader readCommonMark) ,("rst" , TextReader readRST) ,("mediawiki" , TextReader readMediaWiki) diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs new file mode 100644 index 000000000..b9ef96675 --- /dev/null +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -0,0 +1,529 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TupleSections #-} + +{- + Copyright (C) 2018-2019 Alexander Krotov <ilabdsf@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.DokuWiki + Copyright : Copyright (C) 2018 Alexander Krotov + License : GNU GPL, version 2 or above + + Maintainer : Alexander Krotov <ilabdsf@gmail.com> + Stability : alpha + Portability : portable + +Conversion of DokuWiki text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.DokuWiki (readDokuWiki) where + +import Prelude +import Control.Monad +import Control.Monad.Except (throwError) +import Data.Char (isAlphaNum, isDigit) +import qualified Data.Foldable as F +import Data.List (intercalate, transpose, isPrefixOf, isSuffixOf) +import Data.List.Split (splitOn) +import Data.Maybe (fromMaybe, catMaybes) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad (..)) +import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError (PandocParsecError)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed, nested) +import Text.Pandoc.Shared (crFilter, trim, underlineSpan) + +-- | Read DokuWiki from an input string and return a Pandoc document. +readDokuWiki :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readDokuWiki opts s = do + let input = crFilter s + res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input + case res of + Left e -> throwError $ PandocParsecError (T.unpack input) e + Right d -> return d + +type DWParser = ParserT Text ParserState + +-- * Utility functions + +-- | Parse end-of-line, which can be either a newline or end-of-file. +eol :: Stream s m Char => ParserT s st m () +eol = void newline <|> eof + +nested :: PandocMonad m => DWParser m a -> DWParser m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +guardColumnOne :: PandocMonad m => DWParser m () +guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1) + +-- | Parse DokuWiki document. +parseDokuWiki :: PandocMonad m => DWParser m Pandoc +parseDokuWiki = + B.doc . mconcat <$> many block <* spaces <* eof + +-- | Parse <code> and <file> attributes +codeLanguage :: PandocMonad m => DWParser m (String, [String], [(String, String)]) +codeLanguage = try $ do + rawLang <- option "-" (spaceChar *> manyTill anyChar (lookAhead (spaceChar <|> char '>'))) + let attr = case rawLang of + "-" -> [] + l -> [l] + return ("", attr, []) + +-- | Generic parser for <code> and <file> tags +codeTag :: PandocMonad m + => ((String, [String], [(String, String)]) -> String -> a) + -> String + -> DWParser m a +codeTag f tag = try $ f + <$ char '<' + <* string tag + <*> codeLanguage + <* manyTill anyChar (char '>') + <* optional (manyTill spaceChar eol) + <*> manyTill anyChar (try $ string "</" <* string tag <* char '>') + +-- * Inline parsers + +-- | Parse any inline element but softbreak. +inline' :: PandocMonad m => DWParser m B.Inlines +inline' = whitespace + <|> br + <|> bold + <|> italic + <|> underlined + <|> nowiki + <|> percent + <|> link + <|> image + <|> monospaced + <|> subscript + <|> superscript + <|> deleted + <|> footnote + <|> inlineCode + <|> inlineFile + <|> inlineHtml + <|> inlinePhp + <|> autoLink + <|> autoEmail + <|> notoc + <|> nocache + <|> str + <|> symbol + <?> "inline" + +-- | Parse any inline element, including soft break. +inline :: PandocMonad m => DWParser m B.Inlines +inline = endline <|> inline' + +endline :: PandocMonad m => DWParser m B.Inlines +endline = try $ B.softbreak <$ skipMany spaceChar <* linebreak + +whitespace :: PandocMonad m => DWParser m B.Inlines +whitespace = try $ B.space <$ skipMany1 spaceChar + +br :: PandocMonad m => DWParser m B.Inlines +br = try $ B.linebreak <$ string "\\\\" <* space + +linebreak :: PandocMonad m => DWParser m B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = mempty <$ eof + innerNewline = pure B.space + +between :: (Monoid c, PandocMonad m, Show b) + => DWParser m a -> DWParser m b -> (DWParser m b -> DWParser m c) + -> DWParser m c +between start end p = + mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) + +enclosed :: (Monoid b, PandocMonad m, Show a) + => DWParser m a -> (DWParser m a -> DWParser m b) -> DWParser m b +enclosed sep p = between sep (try sep) p + +nestedInlines :: (Show a, PandocMonad m) + => DWParser m a -> DWParser m B.Inlines +nestedInlines end = innerSpace <|> nestedInline + where + innerSpace = try $ whitespace <* notFollowedBy end + nestedInline = notFollowedBy whitespace >> nested inline + +bold :: PandocMonad m => DWParser m B.Inlines +bold = try $ B.strong <$> enclosed (string "**") nestedInlines + +italic :: PandocMonad m => DWParser m B.Inlines +italic = try $ B.emph <$> enclosed (string "//") nestedInlines + +underlined :: PandocMonad m => DWParser m B.Inlines +underlined = try $ underlineSpan <$> enclosed (string "__") nestedInlines + +nowiki :: PandocMonad m => DWParser m B.Inlines +nowiki = try $ B.text <$ string "<nowiki>" <*> manyTill anyChar (try $ string "</nowiki>") + +percent :: PandocMonad m => DWParser m B.Inlines +percent = try $ B.text <$> enclosed (string "%%") nestedString + +nestedString :: (Show a, PandocMonad m) + => DWParser m a -> DWParser m String +nestedString end = innerSpace <|> count 1 nonspaceChar + where + innerSpace = try $ many1 spaceChar <* notFollowedBy end + +monospaced :: PandocMonad m => DWParser m B.Inlines +monospaced = try $ B.code <$> enclosed (string "''") nestedString + +subscript :: PandocMonad m => DWParser m B.Inlines +subscript = try $ B.subscript <$> between (string "<sub>") (try $ string "</sub>") nestedInlines + +superscript :: PandocMonad m => DWParser m B.Inlines +superscript = try $ B.superscript <$> between (string "<sup>") (try $ string "</sup>") nestedInlines + +deleted :: PandocMonad m => DWParser m B.Inlines +deleted = try $ B.strikeout <$> between (string "<del>") (try $ string "</del>") nestedInlines + +-- | Parse a footnote. +footnote :: PandocMonad m => DWParser m B.Inlines +footnote = try $ B.note . B.para <$> between (string "((") (try $ string "))") nestedInlines + +inlineCode :: PandocMonad m => DWParser m B.Inlines +inlineCode = codeTag B.codeWith "code" + +inlineFile :: PandocMonad m => DWParser m B.Inlines +inlineFile = codeTag B.codeWith "file" + +inlineHtml :: PandocMonad m => DWParser m B.Inlines +inlineHtml = try $ B.rawInline "html" <$ string "<html>" <*> manyTill anyChar (try $ string "</html>") + +inlinePhp :: PandocMonad m => DWParser m B.Inlines +inlinePhp = try $ B.codeWith ("", ["php"], []) <$ string "<php>" <*> manyTill anyChar (try $ string "</php>") + +makeLink :: (String, String) -> B.Inlines +makeLink (text, url) = B.link url "" $ B.str text + +autoEmail :: PandocMonad m => DWParser m B.Inlines +autoEmail = try $ do + state <- getState + guard $ stateAllowLinks state + makeLink <$ char '<' <*> emailAddress <* char '>' + +autoLink :: PandocMonad m => DWParser m B.Inlines +autoLink = try $ do + state <- getState + guard $ stateAllowLinks state + (text, url) <- uri + guard $ checkLink (last url) + return $ makeLink (text, url) + where + checkLink c + | c == '/' = True + | otherwise = isAlphaNum c + +notoc :: PandocMonad m => DWParser m B.Inlines +notoc = try $ mempty <$ string "~~NOTOC~~" + +nocache :: PandocMonad m => DWParser m B.Inlines +nocache = try $ mempty <$ string "~~NOCACHE~~" + +str :: PandocMonad m => DWParser m B.Inlines +str = B.str <$> (many1 alphaNum <|> count 1 characterReference) + +symbol :: PandocMonad m => DWParser m B.Inlines +symbol = B.str <$> count 1 nonspaceChar + +link :: PandocMonad m => DWParser m B.Inlines +link = try $ do + st <- getState + guard $ stateAllowLinks st + setState $ st{ stateAllowLinks = False } + l <- linkText + setState $ st{ stateAllowLinks = True } + return l + +isExternalLink :: String -> Bool +isExternalLink s = + case dropWhile (\c -> isAlphaNum c || (c `elem` ['-', '.', '+'])) s of + (':':'/':'/':_) -> True + _ -> False + +isAbsolutePath :: String -> Bool +isAbsolutePath ('.':_) = False +isAbsolutePath s = ':' `elem` s + +normalizeDots :: String -> String +normalizeDots path@('.':_) = + case dropWhile (== '.') path of + ':':_ -> path + _ -> takeWhile (== '.') path ++ ':':dropWhile (== '.') path +normalizeDots path = path + +normalizeInternalPath :: String -> String +normalizeInternalPath path = + if isAbsolutePath path + then ensureAbsolute normalizedPath + else normalizedPath + where + normalizedPath = intercalate "/" $ dropWhile (== ".") $ splitOn ":" $ normalizeDots path + ensureAbsolute s@('/':_) = s + ensureAbsolute s = '/':s + +normalizePath :: String -> String +normalizePath path = + if isExternalLink path + then path + else normalizeInternalPath path + +urlToText :: String -> String +urlToText url = + if isExternalLink url + then url + else reverse $ takeWhile (/= ':') $ reverse url + +-- Parse link or image +parseLink :: PandocMonad m + => (String -> Maybe B.Inlines -> B.Inlines) + -> String + -> String + -> DWParser m B.Inlines +parseLink f l r = f + <$ string l + <*> many1Till anyChar (lookAhead (void (char '|') <|> try (void $ string r))) + <*> optionMaybe (B.trimInlines . mconcat <$> (char '|' *> manyTill inline (try $ lookAhead $ string r))) + <* string r + +-- | Split Interwiki link into left and right part +-- | Return Nothing if it is not Interwiki link +splitInterwiki :: String -> Maybe (String, String) +splitInterwiki path = + case span (\c -> isAlphaNum c || c == '.') path of + (l, '>':r) -> Just (l, r) + _ -> Nothing + +interwikiToUrl :: String -> String -> String +interwikiToUrl "callto" page = "callto://" ++ page +interwikiToUrl "doku" page = "https://www.dokuwiki.org/" ++ page +interwikiToUrl "phpfn" page = "https://secure.php.net/" ++ page +interwikiToUrl "tel" page = "tel:" ++ page +interwikiToUrl "wp" page = "https://en.wikipedia.org/wiki/" ++ page +interwikiToUrl "wpde" page = "https://de.wikipedia.org/wiki/" ++ page +interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" ++ page +interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" ++ page +interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" ++ page +interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" ++ page +interwikiToUrl _ page = "https://www.google.com/search?q=" ++ page ++ "&btnI=lucky" + +linkText :: PandocMonad m => DWParser m B.Inlines +linkText = parseLink fromRaw "[[" "]]" + where + fromRaw path description = + B.link normalizedPath "" (fromMaybe (B.str defaultDescription) description) + where + path' = trim path + interwiki = splitInterwiki path' + normalizedPath = + case interwiki of + Nothing -> normalizePath path' + Just (l, r) -> interwikiToUrl l r + defaultDescription = + case interwiki of + Nothing -> urlToText path' + Just (_, r) -> r + +-- Matches strings like "100x100" (width x height) and "50" (width) +isWidthHeightParameter :: String -> Bool +isWidthHeightParameter s = + case s of + (x:xs) -> + isDigit x && case dropWhile isDigit xs of + ('x':ys@(_:_)) -> all isDigit ys + "" -> True + _ -> False + _ -> False + +parseWidthHeight :: String -> (Maybe String, Maybe String) +parseWidthHeight s = (width, height) + where + width = Just $ takeWhile isDigit s + height = + case dropWhile isDigit s of + ('x':xs) -> Just xs + _ -> Nothing + +image :: PandocMonad m => DWParser m B.Inlines +image = parseLink fromRaw "{{" "}}" + where + fromRaw path description = + if linkOnly + then B.link normalizedPath "" (fromMaybe defaultDescription description) + else B.imageWith ("", classes, attributes) normalizedPath "" (fromMaybe defaultDescription description) + where + (path', parameters) = span (/= '?') $ trim path + normalizedPath = normalizePath path' + leftPadding = " " `isPrefixOf` path + rightPadding = " " `isSuffixOf` path + classes = + case (leftPadding, rightPadding) of + (False, False) -> [] + (False, True) -> ["align-left"] + (True, False) -> ["align-right"] + (True, True) -> ["align-center"] + parameterList = splitOn "&" $ drop 1 parameters + linkOnly = "linkonly" `elem` parameterList + (width, height) = maybe (Nothing, Nothing) parseWidthHeight (F.find isWidthHeightParameter parameterList) + attributes = catMaybes [fmap ("width",) width, fmap ("height",) height] + defaultDescription = B.str $ urlToText path' + +-- * Block parsers + +block :: PandocMonad m => DWParser m B.Blocks +block = do + res <- mempty <$ skipMany1 blankline + <|> blockElements + <|> para + skipMany blankline + trace (take 60 $ show $ B.toList res) + return res + +blockElements :: PandocMonad m => DWParser m B.Blocks +blockElements = horizontalLine + <|> header + <|> list " " + <|> indentedCode + <|> quote + <|> blockCode + <|> blockFile + <|> blockHtml + <|> blockPhp + <|> table + +horizontalLine :: PandocMonad m => DWParser m B.Blocks +horizontalLine = try $ B.horizontalRule <$ string "---" <* many1 (char '-') <* eol + +header :: PandocMonad m => DWParser m B.Blocks +header = try $ do + guardColumnOne + eqs <- many1 (char '=') + let lev = length eqs + guard $ lev < 7 + contents <- B.trimInlines . mconcat <$> manyTill inline (try $ char '=' *> many1 (char '=')) + attr <- registerHeader nullAttr contents + return $ B.headerWith attr (7 - lev) contents + +list :: PandocMonad m => String -> DWParser m B.Blocks +list prefix = bulletList prefix <|> orderedList prefix + +bulletList :: PandocMonad m => String -> DWParser m B.Blocks +bulletList prefix = try $ B.bulletList <$> parseList prefix '*' + +orderedList :: PandocMonad m => String -> DWParser m B.Blocks +orderedList prefix = try $ B.orderedList <$> parseList prefix '-' + +parseList :: PandocMonad m + => String + -> Char + -> DWParser m [B.Blocks] +parseList prefix marker = + many1 ((<>) <$> item <*> fmap mconcat (many continuation)) + where + continuation = try $ list (" " ++ prefix) + item = try $ string prefix *> char marker *> char ' ' *> itemContents + itemContents = B.plain . mconcat <$> many1Till inline' eol + +indentedCode :: PandocMonad m => DWParser m B.Blocks +indentedCode = try $ B.codeBlock . unlines <$> many1 indentedLine + where + indentedLine = try $ string " " *> manyTill anyChar eol + +quote :: PandocMonad m => DWParser m B.Blocks +quote = try $ nestedQuote 0 + where + prefix level = count level (char '>') + contents level = nestedQuote level <|> quoteLine + quoteLine = try $ B.plain . B.trimInlines . mconcat <$> many1Till inline' eol + quoteContents level = (<>) <$> contents level <*> quoteContinuation level + quoteContinuation level = mconcat <$> many (try $ prefix level *> contents level) + nestedQuote level = B.blockQuote <$ char '>' <*> quoteContents (level + 1 :: Int) + +blockHtml :: PandocMonad m => DWParser m B.Blocks +blockHtml = try $ B.rawBlock "html" + <$ string "<HTML>" + <* optional (manyTill spaceChar eol) + <*> manyTill anyChar (try $ string "</HTML>") + +blockPhp :: PandocMonad m => DWParser m B.Blocks +blockPhp = try $ B.codeBlockWith ("", ["php"], []) + <$ string "<PHP>" + <* optional (manyTill spaceChar eol) + <*> manyTill anyChar (try $ string "</PHP>") + +table :: PandocMonad m => DWParser m B.Blocks +table = do + firstSeparator <- lookAhead tableCellSeparator + rows <- tableRows + let (headerRow, body) = if firstSeparator == '^' + then (head rows, tail rows) + else ([], rows) + let attrs = const (AlignDefault, 0.0) <$> transpose rows + pure $ B.table mempty attrs headerRow body + +tableRows :: PandocMonad m => DWParser m [[B.Blocks]] +tableRows = many1 tableRow + +tableRow :: PandocMonad m => DWParser m [B.Blocks] +tableRow = many1Till tableCell tableRowEnd + +tableRowEnd :: PandocMonad m => DWParser m Char +tableRowEnd = try $ tableCellSeparator <* manyTill spaceChar eol + +tableCellSeparator :: PandocMonad m => DWParser m Char +tableCellSeparator = char '|' <|> char '^' + +tableCell :: PandocMonad m => DWParser m B.Blocks +tableCell = try $ B.plain . B.trimInlines . mconcat <$> (normalCell <|> headerCell) + where + normalCell = char '|' *> manyTill inline' (lookAhead tableCellSeparator) + headerCell = char '^' *> manyTill inline' (lookAhead tableCellSeparator) + +blockCode :: PandocMonad m => DWParser m B.Blocks +blockCode = codeTag B.codeBlockWith "code" + +blockFile :: PandocMonad m => DWParser m B.Blocks +blockFile = codeTag B.codeBlockWith "file" + +para :: PandocMonad m => DWParser m B.Blocks +para = result . mconcat <$> many1Till inline endOfParaElement + where + endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + newBlockElement = try $ blankline >> void blockElements + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content |