From 66fd3247eac9a564aca179ca9fe4fd007ee471e7 Mon Sep 17 00:00:00 2001 From: Sascha Wilde Date: Fri, 27 Oct 2017 01:19:28 +0200 Subject: Creole reader (#3994) This is feature complete but not very thoroughly tested yet. --- src/Text/Pandoc/Readers.hs | 3 + src/Text/Pandoc/Readers/Creole.hs | 316 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Creole.hs (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 92a185e0d..9fc9c3d18 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -45,6 +45,7 @@ module Text.Pandoc.Readers , readOdt , readMarkdown , readCommonMark + , readCreole , readMediaWiki , readVimwiki , readRST @@ -76,6 +77,7 @@ import Text.Pandoc.Error import Text.Pandoc.Extensions import Text.Pandoc.Options import Text.Pandoc.Readers.CommonMark +import Text.Pandoc.Readers.Creole import Text.Pandoc.Readers.DocBook import Text.Pandoc.Readers.Docx import Text.Pandoc.Readers.EPUB @@ -117,6 +119,7 @@ readers = [ ("native" , TextReader readNative) ,("markdown_github" , TextReader readMarkdown) ,("markdown_mmd", TextReader readMarkdown) ,("commonmark" , TextReader readCommonMark) + ,("creole" , TextReader readCreole) ,("gfm" , TextReader readCommonMark) ,("rst" , TextReader readRST) ,("mediawiki" , TextReader readMediaWiki) diff --git a/src/Text/Pandoc/Readers/Creole.hs b/src/Text/Pandoc/Readers/Creole.hs new file mode 100644 index 000000000..ab90772ef --- /dev/null +++ b/src/Text/Pandoc/Readers/Creole.hs @@ -0,0 +1,316 @@ +{- + Copyright (C) 2017 Sascha Wilde + + partly based on all the other readers, especialy the work by + John MacFarlane and + Alexander Sulfrian + all bugs are solely created by me. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Creole + Copyright : Copyright (C) 2017 Sascha Wilde + License : GNU GPL, version 2 or above + + Maintainer : Sascha Wilde + Stability : WIP + Portability : portable + +Conversion of creole text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Creole ( readCreole + ) where + +import Control.Monad.Except (liftM2, throwError, guard) +import qualified Data.Foldable as F +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class (PandocMonad(..)) +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (enclosed) +import Text.Pandoc.Shared (crFilter) +import Data.Maybe (fromMaybe) +import Data.Monoid +import Data.Text (Text) +import qualified Data.Text as T + + +-- | Read creole from an input string and return a Pandoc document. +readCreole :: PandocMonad m + => ReaderOptions + -> Text + -> m Pandoc +readCreole opts s = do + res <- readWithM parseCreole def{ stateOptions = opts } + (T.unpack (crFilter s) ++ "\n\n") + case res of + Left e -> throwError e + Right d -> return d + +type CRLParser = ParserT [Char] ParserState + +-- +-- Utility funcitons +-- + +(<+>) :: (Monad m, Monoid a) => m a -> m a -> m a +(<+>) = liftM2 (<>) + +-- we have to redefine `enclosed' from Text.Pandoc.Parsing, because it +-- assumes, that there can't be a space after the start parser, but +-- with creole this is possible. +enclosed :: (Show end, PandocMonad m) => CRLParser m start -- ^ start parser + -> CRLParser m end -- ^ end parser + -> CRLParser m a -- ^ content parser (to be used repeatedly) + -> CRLParser m [a] +enclosed start end parser = try $ start >> many1Till parser end + +-- +-- main parser +-- + +specialChars :: [Char] +specialChars = "*/~{}\\|[]()<>\"'" + +parseCreole :: PandocMonad m => CRLParser m Pandoc +parseCreole = do + bs <- mconcat <$> many block + spaces + eof + return $ B.doc bs + + +-- +-- block parsers +-- + +block :: PandocMonad m => CRLParser m B.Blocks +block = do + res <- mempty <$ skipMany1 blankline + <|> nowiki + <|> header + <|> horizontalRule + <|> anyList 1 + <|> table + <|> para + skipMany blankline + return res + +nowiki :: PandocMonad m => CRLParser m B.Blocks +nowiki = try $ nowikiStart >> manyTill content nowikiEnd >>= return . B.codeBlock . mconcat + where + content = brackets <|> line + brackets = try $ option "" ((:[]) <$> newline) + <+> (char ' ' >> (many (char ' ') <+> string "}}}") <* eol) + line = option "" ((:[]) <$> newline) <+> manyTill anyChar eol + eol = lookAhead $ try $ nowikiEnd <|> newline + nowikiStart = optional newline >> string "{{{" >> skipMany spaceChar >> newline + nowikiEnd = try $ linebreak >> string "}}}" >> skipMany spaceChar >> newline + +header :: PandocMonad m => CRLParser m B.Blocks +header = try $ do + skipSpaces + level <- many1 (char '=') >>= return . length + guard $ level <= 6 + skipSpaces + content <- B.str <$> manyTill (noneOf "\n") headerEnd + return $ B.header level content + where + headerEnd = try $ skipSpaces >> many (char '=') >> skipSpaces >> newline + +unorderedList :: PandocMonad m => Int -> CRLParser m B.Blocks +unorderedList = list '*' B.bulletList + +orderedList :: PandocMonad m => Int -> CRLParser m B.Blocks +orderedList = list '#' B.orderedList + +anyList :: PandocMonad m => Int -> CRLParser m B.Blocks +anyList n = unorderedList n <|> orderedList n + +anyListItem :: PandocMonad m => Int -> CRLParser m B.Blocks +anyListItem n = listItem '*' n <|> listItem '#' n + +list :: PandocMonad m => Char -> ([B.Blocks] -> B.Blocks) -> Int -> CRLParser m B.Blocks +list c f n = many1 (itemPlusSublist <|> listItem c n) + >>= return . f + where itemPlusSublist = try $ listItem c n <+> anyList (n+1) + +listItem :: PandocMonad m => Char -> Int -> CRLParser m B.Blocks +listItem c n = (listStart >> many1Till inline itemEnd) + >>= return . B.plain . B.trimInlines .mconcat + where + listStart = try $ optional newline >> skipSpaces >> count n (char c) + >> (lookAhead $ noneOf [c]) >> skipSpaces + itemEnd = endOfParaElement <|> nextItem n + <|> if n < 3 then nextItem (n+1) + else nextItem (n+1) <|> nextItem (n-1) + nextItem x = lookAhead $ try $ blankline >> anyListItem x >> return mempty + +table :: PandocMonad m => CRLParser m B.Blocks +table = try $ do + headers <- optionMaybe headerRow + rows <- many1 row + return $ B.simpleTable (fromMaybe [mempty] headers) rows + where + headerRow = try $ skipSpaces >> many1Till headerCell rowEnd + headerCell = B.plain . B.trimInlines . mconcat + <$> (string "|=" >> many1Till inline cellEnd) + row = try $ skipSpaces >> many1Till cell rowEnd + cell = B.plain . B.trimInlines . mconcat + <$> (char '|' >> many1Till inline cellEnd) + rowEnd = try $ optional (char '|') >> skipSpaces >> newline + cellEnd = lookAhead $ try $ char '|' <|> rowEnd + +para :: PandocMonad m => CRLParser m B.Blocks +para = many1Till inline endOfParaElement >>= return . result . mconcat + where + result content = if F.all (==Space) content + then mempty + else B.para $ B.trimInlines content + +endOfParaElement :: PandocMonad m => CRLParser m () +endOfParaElement = lookAhead $ endOfInput <|> endOfPara + <|> startOfList <|> startOfTable + <|> startOfHeader <|> hr <|> startOfNowiki + where + endOfInput = try $ skipMany blankline >> skipSpaces >> eof + endOfPara = try $ blankline >> skipMany1 blankline + startOf :: PandocMonad m => CRLParser m a -> CRLParser m () + startOf p = try $ blankline >> p >> return mempty + startOfList = startOf $ anyList 1 + startOfTable = startOf $ table + startOfHeader = startOf header + startOfNowiki = startOf nowiki + hr = startOf horizontalRule + +horizontalRule :: PandocMonad m => CRLParser m B.Blocks +horizontalRule = try $ skipSpaces >> string "----" >> skipSpaces >> newline + >> return B.horizontalRule + +-- +-- inline parsers +-- + +inline :: PandocMonad m => CRLParser m B.Inlines +inline = choice [ whitespace + , escapedLink + , escapedChar + , link + , inlineNowiki + , placeholder + , image + , forcedLinebreak + , bold + , finalBold + , italics + , finalItalics + , str + , symbol + ] "inline" + +escapedChar :: PandocMonad m => CRLParser m B.Inlines +escapedChar = (try $ char '~' >> noneOf "\t\n ") >>= return . B.str . (:[]) + +escapedLink :: PandocMonad m => CRLParser m B.Inlines +escapedLink = try $ do + char '~' + (orig, _) <- uri + return $ B.str orig + +image :: PandocMonad m => CRLParser m B.Inlines +image = try $ do + (orig, src) <- wikiImg + return $ B.image src "" (B.str $ orig) + where + linkSrc = many $ noneOf "|}\n\r\t" + linkDsc = char '|' >> many (noneOf "}\n\r\t") + wikiImg = try $ do + string "{{" + src <- linkSrc + dsc <- option "" linkDsc + string "}}" + return (dsc, src) + +link :: PandocMonad m => CRLParser m B.Inlines +link = try $ do + (orig, src) <- uriLink <|> wikiLink + return $ B.link src "" orig + where + linkSrc = many $ noneOf "|]\n\r\t" + linkDsc :: PandocMonad m => String -> CRLParser m B.Inlines + linkDsc otxt = B.str + <$> (try $ option otxt + (char '|' >> many (noneOf "]\n\r\t"))) + linkImg = try $ char '|' >> image + wikiLink = try $ do + string "[[" + src <- linkSrc + dsc <- linkImg <|> linkDsc src + string "]]" + return (dsc, src) + uriLink = try $ do + (orig, src) <- uri + return (B.str orig, src) + +inlineNowiki :: PandocMonad m => CRLParser m B.Inlines +inlineNowiki = B.code <$> (start >> manyTill (noneOf "\n\r") end) + where + start = try $ string "{{{" + end = try $ string "}}}" >> (lookAhead $ noneOf "}") + +placeholder :: PandocMonad m => CRLParser m B.Inlines +-- The semantics of the placeholder is basicallly implementation +-- dependent, so there is no way to DTRT for all cases. +-- So for now we just drop them. +placeholder = B.text <$> (try $ string "<<<" >> manyTill anyChar (string ">>>") + >> return "") + +whitespace :: PandocMonad m => CRLParser m B.Inlines +whitespace = (lb <|> regsp) >>= return + where lb = try $ skipMany spaceChar >> linebreak >> return B.space + regsp = try $ skipMany1 spaceChar >> return B.space + +linebreak :: PandocMonad m => CRLParser m B.Inlines +linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) + where lastNewline = eof >> return mempty + innerNewline = return B.space + +symbol :: PandocMonad m => CRLParser m B.Inlines +symbol = oneOf specialChars >>= return . B.str . (:[]) + +str :: PandocMonad m => CRLParser m B.Inlines +str = let strChar = noneOf ("\t\n " ++ specialChars) in + many1 strChar >>= return . B.str + +bold :: PandocMonad m => CRLParser m B.Inlines +bold = B.strong . mconcat <$> + enclosed (string "**") (try $ string "**") inline + +italics :: PandocMonad m => CRLParser m B.Inlines +italics = B.emph . mconcat <$> + enclosed (string "//") (try $ string "//") inline + +finalBold :: PandocMonad m => CRLParser m B.Inlines +finalBold = B.strong . mconcat <$> + try (string "**" >> many1Till inline endOfParaElement) + +finalItalics :: PandocMonad m => CRLParser m B.Inlines +finalItalics = B.emph . mconcat <$> + try (string "//" >> many1Till inline endOfParaElement) + +forcedLinebreak :: PandocMonad m => CRLParser m B.Inlines +forcedLinebreak = try $ string "\\\\" >> return B.linebreak -- cgit v1.2.3