Add DokuWiki reader (#5108)

Closes #1792
author: Alexander <ilabdsf@gmail.com> 2019-01-06 23:06:32 +0000
committer: John MacFarlane <jgm@berkeley.edu> 2019-01-06 15:06:32 -0800
commit: 40c30a9d88101e2a00b904597c6ad4d8f8bce60b (patch)
tree: e18e2a5474c0c97b5ca8b40915fa92ac1beae985 /src/Text/Pandoc
parent: a5f5002eefd43820b1f54191b0cf3cbd743eca0a (diff)
download: pandoc-40c30a9d88101e2a00b904597c6ad4d8f8bce60b.tar.gz
2 files changed, 532 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 630b8b858..434725887 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -47,6 +47,7 @@ module Text.Pandoc.Readers
   , readMarkdown
   , readCommonMark
   , readCreole
+  , readDokuWiki
   , readMediaWiki
   , readVimwiki
   , readRST
@@ -86,6 +87,7 @@ import Text.Pandoc.Readers.CommonMark
 import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
+import Text.Pandoc.Readers.DokuWiki
 import Text.Pandoc.Readers.EPUB
 import Text.Pandoc.Readers.FB2
 import Text.Pandoc.Readers.Haddock
@@ -123,6 +125,7 @@ readers = [ ("native"       , TextReader readNative)
            ,("markdown_mmd",  TextReader readMarkdown)
            ,("commonmark"   , TextReader readCommonMark)
            ,("creole"       , TextReader readCreole)
+           ,("dokuwiki"     , TextReader readDokuWiki)
            ,("gfm"          , TextReader readCommonMark)
            ,("rst"          , TextReader readRST)
            ,("mediawiki"    , TextReader readMediaWiki)
diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs
new file mode 100644
index 000000000..b9ef96675
--- /dev/null
+++ b/src/Text/Pandoc/Readers/DokuWiki.hs
@@ -0,0 +1,529 @@
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TupleSections     #-}
+
+{-
+  Copyright (C) 2018-2019 Alexander Krotov <ilabdsf@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.DokuWiki
+   Copyright   : Copyright (C) 2018 Alexander Krotov
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Alexander Krotov <ilabdsf@gmail.com>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of DokuWiki text to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.DokuWiki (readDokuWiki) where
+
+import Prelude
+import Control.Monad
+import Control.Monad.Except (throwError)
+import Data.Char (isAlphaNum, isDigit)
+import qualified Data.Foldable as F
+import Data.List (intercalate, transpose, isPrefixOf, isSuffixOf)
+import Data.List.Split (splitOn)
+import Data.Maybe (fromMaybe, catMaybes)
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Class (PandocMonad (..))
+import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError (PandocParsecError))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing hiding (enclosed, nested)
+import Text.Pandoc.Shared (crFilter, trim, underlineSpan)
+
+-- | Read DokuWiki from an input string and return a Pandoc document.
+readDokuWiki :: PandocMonad m
+             => ReaderOptions
+             -> Text
+             -> m Pandoc
+readDokuWiki opts s = do
+  let input = crFilter s
+  res <- runParserT parseDokuWiki def {stateOptions = opts } "source" input
+  case res of
+       Left e  -> throwError $ PandocParsecError (T.unpack input) e
+       Right d -> return d
+
+type DWParser = ParserT Text ParserState
+
+-- * Utility functions
+
+-- | Parse end-of-line, which can be either a newline or end-of-file.
+eol :: Stream s m Char => ParserT s st m ()
+eol = void newline <|> eof
+
+nested :: PandocMonad m => DWParser m a -> DWParser m a
+nested p = do
+  nestlevel <- stateMaxNestingLevel <$>  getState
+  guard $ nestlevel > 0
+  updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 }
+  res <- p
+  updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
+  return res
+
+guardColumnOne :: PandocMonad m => DWParser m ()
+guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1)
+
+-- | Parse DokuWiki document.
+parseDokuWiki :: PandocMonad m => DWParser m Pandoc
+parseDokuWiki =
+  B.doc . mconcat <$> many block <* spaces <* eof
+
+-- | Parse <code> and <file> attributes
+codeLanguage :: PandocMonad m => DWParser m (String, [String], [(String, String)])
+codeLanguage = try $ do
+  rawLang <- option "-" (spaceChar *> manyTill anyChar (lookAhead (spaceChar <|> char '>')))
+  let attr = case rawLang of
+               "-" -> []
+               l -> [l]
+  return ("", attr, [])
+
+-- | Generic parser for <code> and <file> tags
+codeTag :: PandocMonad m
+        => ((String, [String], [(String, String)]) -> String -> a)
+        -> String
+        -> DWParser m a
+codeTag f tag = try $ f
+  <$  char '<'
+  <*  string tag
+  <*> codeLanguage
+  <*  manyTill anyChar (char '>')
+  <*  optional (manyTill spaceChar eol)
+  <*> manyTill anyChar (try $ string "</" <* string tag <* char '>')
+
+-- * Inline parsers
+
+-- | Parse any inline element but softbreak.
+inline' :: PandocMonad m => DWParser m B.Inlines
+inline' = whitespace
+      <|> br
+      <|> bold
+      <|> italic
+      <|> underlined
+      <|> nowiki
+      <|> percent
+      <|> link
+      <|> image
+      <|> monospaced
+      <|> subscript
+      <|> superscript
+      <|> deleted
+      <|> footnote
+      <|> inlineCode
+      <|> inlineFile
+      <|> inlineHtml
+      <|> inlinePhp
+      <|> autoLink
+      <|> autoEmail
+      <|> notoc
+      <|> nocache
+      <|> str
+      <|> symbol
+      <?> "inline"
+
+-- | Parse any inline element, including soft break.
+inline :: PandocMonad m => DWParser m B.Inlines
+inline = endline <|> inline'
+
+endline :: PandocMonad m => DWParser m B.Inlines
+endline = try $ B.softbreak <$ skipMany spaceChar <* linebreak
+
+whitespace :: PandocMonad m => DWParser m B.Inlines
+whitespace = try $ B.space <$ skipMany1 spaceChar
+
+br :: PandocMonad m => DWParser m B.Inlines
+br = try $ B.linebreak <$ string "\\\\" <* space
+
+linebreak :: PandocMonad m => DWParser m B.Inlines
+linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline)
+  where lastNewline  = mempty <$ eof
+        innerNewline = pure B.space
+
+between :: (Monoid c, PandocMonad m, Show b)
+        => DWParser m a -> DWParser m b -> (DWParser m b -> DWParser m c)
+        -> DWParser m c
+between start end p =
+  mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end)
+
+enclosed :: (Monoid b, PandocMonad m, Show a)
+         => DWParser m a -> (DWParser m a -> DWParser m b) -> DWParser m b
+enclosed sep p = between sep (try sep) p
+
+nestedInlines :: (Show a, PandocMonad m)
+              => DWParser m a -> DWParser m B.Inlines
+nestedInlines end = innerSpace <|> nestedInline
+  where
+    innerSpace   = try $ whitespace <* notFollowedBy end
+    nestedInline = notFollowedBy whitespace >> nested inline
+
+bold :: PandocMonad m => DWParser m B.Inlines
+bold = try $ B.strong <$> enclosed (string "**") nestedInlines
+
+italic :: PandocMonad m => DWParser m B.Inlines
+italic = try $ B.emph <$> enclosed (string "//") nestedInlines
+
+underlined :: PandocMonad m => DWParser m B.Inlines
+underlined = try $ underlineSpan <$> enclosed (string "__") nestedInlines
+
+nowiki :: PandocMonad m => DWParser m B.Inlines
+nowiki = try $ B.text <$ string "<nowiki>" <*> manyTill anyChar (try $ string "</nowiki>")
+
+percent :: PandocMonad m => DWParser m B.Inlines
+percent = try $ B.text <$> enclosed (string "%%") nestedString
+
+nestedString :: (Show a, PandocMonad m)
+             => DWParser m a -> DWParser m String
+nestedString end = innerSpace <|> count 1 nonspaceChar
+  where
+    innerSpace = try $ many1 spaceChar <* notFollowedBy end
+
+monospaced :: PandocMonad m => DWParser m B.Inlines
+monospaced = try $ B.code <$> enclosed (string "''") nestedString
+
+subscript :: PandocMonad m => DWParser m B.Inlines
+subscript = try $ B.subscript <$> between (string "<sub>") (try $ string "</sub>") nestedInlines
+
+superscript :: PandocMonad m => DWParser m B.Inlines
+superscript = try $ B.superscript <$> between (string "<sup>") (try $ string "</sup>") nestedInlines
+
+deleted :: PandocMonad m => DWParser m B.Inlines
+deleted = try $ B.strikeout <$> between (string "<del>") (try $ string "</del>") nestedInlines
+
+-- | Parse a footnote.
+footnote :: PandocMonad m => DWParser m B.Inlines
+footnote = try $ B.note . B.para <$> between (string "((") (try $ string "))") nestedInlines
+
+inlineCode :: PandocMonad m => DWParser m B.Inlines
+inlineCode = codeTag B.codeWith "code"
+
+inlineFile :: PandocMonad m => DWParser m B.Inlines
+inlineFile = codeTag B.codeWith "file"
+
+inlineHtml :: PandocMonad m => DWParser m B.Inlines
+inlineHtml = try $ B.rawInline "html" <$ string "<html>" <*> manyTill anyChar (try $ string "</html>")
+
+inlinePhp :: PandocMonad m => DWParser m B.Inlines
+inlinePhp = try $ B.codeWith ("", ["php"], []) <$ string "<php>" <*> manyTill anyChar (try $ string "</php>")
+
+makeLink :: (String, String) -> B.Inlines
+makeLink (text, url) = B.link url "" $ B.str text
+
+autoEmail :: PandocMonad m => DWParser m B.Inlines
+autoEmail = try $ do
+  state <- getState
+  guard $ stateAllowLinks state
+  makeLink <$ char '<' <*> emailAddress <* char '>'
+
+autoLink :: PandocMonad m => DWParser m B.Inlines
+autoLink = try $ do
+  state <- getState
+  guard $ stateAllowLinks state
+  (text, url) <- uri
+  guard $ checkLink (last url)
+  return $ makeLink (text, url)
+  where
+    checkLink c
+      | c == '/' = True
+      | otherwise = isAlphaNum c
+
+notoc :: PandocMonad m => DWParser m B.Inlines
+notoc = try $ mempty <$ string "~~NOTOC~~"
+
+nocache :: PandocMonad m => DWParser m B.Inlines
+nocache = try $ mempty <$ string "~~NOCACHE~~"
+
+str :: PandocMonad m => DWParser m B.Inlines
+str = B.str <$> (many1 alphaNum <|> count 1 characterReference)
+
+symbol :: PandocMonad m => DWParser m B.Inlines
+symbol = B.str <$> count 1 nonspaceChar
+
+link :: PandocMonad m => DWParser m B.Inlines
+link = try $ do
+  st <- getState
+  guard $ stateAllowLinks st
+  setState $ st{ stateAllowLinks = False }
+  l <- linkText
+  setState $ st{ stateAllowLinks = True }
+  return l
+
+isExternalLink :: String -> Bool
+isExternalLink s =
+  case dropWhile (\c -> isAlphaNum c || (c `elem` ['-', '.', '+'])) s of
+    (':':'/':'/':_) -> True
+    _ -> False
+
+isAbsolutePath :: String -> Bool
+isAbsolutePath ('.':_) = False
+isAbsolutePath s = ':' `elem` s
+
+normalizeDots :: String -> String
+normalizeDots path@('.':_) =
+  case dropWhile (== '.') path of
+    ':':_ -> path
+    _ -> takeWhile (== '.') path ++ ':':dropWhile (== '.') path
+normalizeDots path = path
+
+normalizeInternalPath :: String -> String
+normalizeInternalPath path =
+  if isAbsolutePath path
+    then ensureAbsolute normalizedPath
+    else normalizedPath
+  where
+    normalizedPath = intercalate "/" $ dropWhile (== ".") $ splitOn ":" $ normalizeDots path
+    ensureAbsolute s@('/':_) = s
+    ensureAbsolute s = '/':s
+
+normalizePath :: String -> String
+normalizePath path =
+  if isExternalLink path
+    then path
+    else normalizeInternalPath path
+
+urlToText :: String -> String
+urlToText url =
+  if isExternalLink url
+    then url
+    else reverse $ takeWhile (/= ':') $ reverse url
+
+-- Parse link or image
+parseLink :: PandocMonad m
+          => (String -> Maybe B.Inlines -> B.Inlines)
+          -> String
+          -> String
+          -> DWParser m B.Inlines
+parseLink f l r = f
+  <$  string l
+  <*> many1Till anyChar (lookAhead (void (char '|') <|> try (void $ string r)))
+  <*> optionMaybe (B.trimInlines . mconcat <$> (char '|' *> manyTill inline (try $ lookAhead $ string r)))
+  <*  string r
+
+-- | Split Interwiki link into left and right part
+-- | Return Nothing if it is not Interwiki link
+splitInterwiki :: String -> Maybe (String, String)
+splitInterwiki path =
+  case span (\c -> isAlphaNum c || c == '.') path of
+    (l, '>':r) -> Just (l, r)
+    _ -> Nothing
+
+interwikiToUrl :: String -> String -> String
+interwikiToUrl "callto" page = "callto://" ++ page
+interwikiToUrl "doku" page = "https://www.dokuwiki.org/" ++ page
+interwikiToUrl "phpfn" page = "https://secure.php.net/" ++ page
+interwikiToUrl "tel" page = "tel:" ++ page
+interwikiToUrl "wp" page = "https://en.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpde" page = "https://de.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" ++ page
+interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" ++ page
+interwikiToUrl _ page = "https://www.google.com/search?q=" ++ page ++ "&btnI=lucky"
+
+linkText :: PandocMonad m => DWParser m B.Inlines
+linkText = parseLink fromRaw "[[" "]]"
+  where
+    fromRaw path description =
+      B.link normalizedPath "" (fromMaybe (B.str defaultDescription) description)
+      where
+        path' = trim path
+        interwiki = splitInterwiki path'
+        normalizedPath =
+          case interwiki of
+            Nothing -> normalizePath path'
+            Just (l, r) -> interwikiToUrl l r
+        defaultDescription =
+          case interwiki of
+            Nothing -> urlToText path'
+            Just (_, r) -> r
+
+-- Matches strings like "100x100" (width x height) and "50" (width)
+isWidthHeightParameter :: String -> Bool
+isWidthHeightParameter s =
+  case s of
+    (x:xs) ->
+      isDigit x && case dropWhile isDigit xs of
+                     ('x':ys@(_:_)) -> all isDigit ys
+                     "" -> True
+                     _ -> False
+    _ -> False
+
+parseWidthHeight :: String -> (Maybe String, Maybe String)
+parseWidthHeight s = (width, height)
+  where
+    width = Just $ takeWhile isDigit s
+    height =
+      case dropWhile isDigit s of
+        ('x':xs) -> Just xs
+        _ -> Nothing
+
+image :: PandocMonad m => DWParser m B.Inlines
+image = parseLink fromRaw "{{" "}}"
+  where
+    fromRaw path description =
+      if linkOnly
+        then B.link normalizedPath "" (fromMaybe defaultDescription description)
+        else B.imageWith ("", classes, attributes) normalizedPath "" (fromMaybe defaultDescription description)
+      where
+        (path', parameters) = span (/= '?') $ trim path
+        normalizedPath = normalizePath path'
+        leftPadding = " " `isPrefixOf` path
+        rightPadding = " " `isSuffixOf` path
+        classes =
+          case (leftPadding, rightPadding) of
+            (False, False) -> []
+            (False, True) -> ["align-left"]
+            (True, False) -> ["align-right"]
+            (True, True) -> ["align-center"]
+        parameterList = splitOn "&" $ drop 1 parameters
+        linkOnly = "linkonly" `elem` parameterList
+        (width, height) = maybe (Nothing, Nothing) parseWidthHeight (F.find isWidthHeightParameter parameterList)
+        attributes = catMaybes [fmap ("width",) width, fmap ("height",) height]
+        defaultDescription = B.str $ urlToText path'
+
+-- * Block parsers
+
+block :: PandocMonad m => DWParser m B.Blocks
+block = do
+  res <- mempty <$ skipMany1 blankline
+         <|> blockElements
+         <|> para
+  skipMany blankline
+  trace (take 60 $ show $ B.toList res)
+  return res
+
+blockElements :: PandocMonad m => DWParser m B.Blocks
+blockElements = horizontalLine
+            <|> header
+            <|> list "  "
+            <|> indentedCode
+            <|> quote
+            <|> blockCode
+            <|> blockFile
+            <|> blockHtml
+            <|> blockPhp
+            <|> table
+
+horizontalLine :: PandocMonad m => DWParser m B.Blocks
+horizontalLine = try $ B.horizontalRule <$ string "---" <* many1 (char '-') <* eol
+
+header :: PandocMonad m => DWParser m B.Blocks
+header = try $ do
+  guardColumnOne
+  eqs <- many1 (char '=')
+  let lev = length eqs
+  guard $ lev < 7
+  contents <- B.trimInlines . mconcat <$> manyTill inline (try $ char '=' *> many1 (char '='))
+  attr <- registerHeader nullAttr contents
+  return $ B.headerWith attr (7 - lev) contents
+
+list :: PandocMonad m => String -> DWParser m B.Blocks
+list prefix = bulletList prefix <|> orderedList prefix
+
+bulletList :: PandocMonad m => String -> DWParser m B.Blocks
+bulletList prefix = try $ B.bulletList <$> parseList prefix '*'
+
+orderedList :: PandocMonad m => String -> DWParser m B.Blocks
+orderedList prefix = try $ B.orderedList <$> parseList prefix '-'
+
+parseList :: PandocMonad m
+          => String
+          -> Char
+          -> DWParser m [B.Blocks]
+parseList prefix marker =
+  many1 ((<>) <$> item <*> fmap mconcat (many continuation))
+  where
+    continuation = try $ list ("  " ++ prefix)
+    item = try $ string prefix *> char marker *> char ' ' *> itemContents
+    itemContents = B.plain . mconcat <$> many1Till inline' eol
+
+indentedCode :: PandocMonad m => DWParser m B.Blocks
+indentedCode = try $ B.codeBlock . unlines <$> many1 indentedLine
+ where
+   indentedLine = try $ string "  " *> manyTill anyChar eol
+
+quote :: PandocMonad m => DWParser m B.Blocks
+quote = try $ nestedQuote 0
+  where
+    prefix level = count level (char '>')
+    contents level = nestedQuote level <|> quoteLine
+    quoteLine = try $ B.plain . B.trimInlines . mconcat <$> many1Till inline' eol
+    quoteContents level = (<>) <$> contents level <*> quoteContinuation level
+    quoteContinuation level = mconcat <$> many (try $ prefix level *> contents level)
+    nestedQuote level = B.blockQuote <$ char '>' <*> quoteContents (level + 1 :: Int)
+
+blockHtml :: PandocMonad m => DWParser m B.Blocks
+blockHtml = try $ B.rawBlock "html"
+  <$  string "<HTML>"
+  <*  optional (manyTill spaceChar eol)
+  <*> manyTill anyChar (try $ string "</HTML>")
+
+blockPhp :: PandocMonad m => DWParser m B.Blocks
+blockPhp = try $ B.codeBlockWith ("", ["php"], [])
+  <$  string "<PHP>"
+  <*  optional (manyTill spaceChar eol)
+  <*> manyTill anyChar (try $ string "</PHP>")
+
+table :: PandocMonad m => DWParser m B.Blocks
+table = do
+  firstSeparator <- lookAhead tableCellSeparator
+  rows <- tableRows
+  let (headerRow, body) = if firstSeparator == '^'
+                            then (head rows, tail rows)
+                            else ([], rows)
+  let attrs = const (AlignDefault, 0.0) <$> transpose rows
+  pure $ B.table mempty attrs headerRow body
+
+tableRows :: PandocMonad m => DWParser m [[B.Blocks]]
+tableRows = many1 tableRow
+
+tableRow :: PandocMonad m => DWParser m [B.Blocks]
+tableRow = many1Till tableCell tableRowEnd
+
+tableRowEnd :: PandocMonad m => DWParser m Char
+tableRowEnd = try $ tableCellSeparator <* manyTill spaceChar eol
+
+tableCellSeparator :: PandocMonad m => DWParser m Char
+tableCellSeparator = char '|' <|> char '^'
+
+tableCell :: PandocMonad m => DWParser m B.Blocks
+tableCell = try $ B.plain . B.trimInlines . mconcat <$> (normalCell <|> headerCell)
+  where
+    normalCell = char '|' *> manyTill inline' (lookAhead tableCellSeparator)
+    headerCell = char '^' *> manyTill inline' (lookAhead tableCellSeparator)
+
+blockCode :: PandocMonad m => DWParser m B.Blocks
+blockCode = codeTag B.codeBlockWith "code"
+
+blockFile :: PandocMonad m => DWParser m B.Blocks
+blockFile = codeTag B.codeBlockWith "file"
+
+para :: PandocMonad m => DWParser m B.Blocks
+para = result . mconcat <$> many1Till inline endOfParaElement
+ where
+   endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement
+   endOfInput       = try $ skipMany blankline >> skipSpaces >> eof
+   endOfPara        = try $ blankline >> skipMany1 blankline
+   newBlockElement  = try $ blankline >> void blockElements
+   result content   = if F.all (==Space) content
+                      then mempty
+                      else B.para $ B.trimInlines content
author	Alexander <ilabdsf@gmail.com>	2019-01-06 23:06:32 +0000
committer	John MacFarlane <jgm@berkeley.edu>	2019-01-06 15:06:32 -0800
commit	40c30a9d88101e2a00b904597c6ad4d8f8bce60b (patch)
tree	e18e2a5474c0c97b5ca8b40915fa92ac1beae985 /src/Text/Pandoc
parent	a5f5002eefd43820b1f54191b0cf3cbd743eca0a (diff)
download	pandoc-40c30a9d88101e2a00b904597c6ad4d8f8bce60b.tar.gz