Add Text.Pandoc.Readers.CSV (readCSV).

This adds csv as an input format. The CSV table is converted into a pandoc simple table. Closes #6100.
author: John MacFarlane <jgm@berkeley.edu> 2020-01-31 21:14:21 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2020-01-31 21:14:21 -0800
commit: f9514ccb9e6efbd33ad84fcc1dfef20603affc4c (patch)
tree: 4da927d6dd9e253d02efb08f30b211d865f33d86 /src/Text
parent: cef30e738460c177e74df39166b351bec75857ea (diff)
download: pandoc-f9514ccb9e6efbd33ad84fcc1dfef20603affc4c.tar.gz
3 files changed, 112 insertions, 0 deletions
diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs
index 25e0a303e..21de09f3d 100644
--- a/src/Text/Pandoc/App/FormatHeuristics.hs
+++ b/src/Text/Pandoc/App/FormatHeuristics.hs
@@ -75,5 +75,6 @@ formatFromFilePath x =
     ".wiki"     -> Just "mediawiki"
     ".xhtml"    -> Just "html"
     ".ipynb"    -> Just "ipynb"
+    ".csv"      -> Just "csv"
     ['.',y]     | y `elem` ['1'..'9'] -> Just "man"
     _           -> Nothing
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 990e78f35..28c7c6810 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -51,6 +51,7 @@ module Text.Pandoc.Readers
   , readMuse
   , readFB2
   , readIpynb
+  , readCSV
   -- * Miscellaneous
   , getReader
   , getDefaultExtensions
@@ -95,6 +96,7 @@ import Text.Pandoc.Readers.TWiki
 import Text.Pandoc.Readers.Txt2Tags
 import Text.Pandoc.Readers.Vimwiki
 import Text.Pandoc.Readers.Man
+import Text.Pandoc.Readers.CSV
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Parsec.Error
 
@@ -136,6 +138,7 @@ readers = [ ("native"       , TextReader readNative)
            ,("man"          , TextReader readMan)
            ,("fb2"          , TextReader readFB2)
            ,("ipynb"        , TextReader readIpynb)
+           ,("csv"          , TextReader readCSV)
            ]
 
 -- | Retrieve reader, extensions based on formatSpec (format+extensions).
diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs
new file mode 100644
index 000000000..3eea9ccbd
--- /dev/null
+++ b/src/Text/Pandoc/Readers/CSV.hs
@@ -0,0 +1,108 @@
+{-# LANGUAGE NoImplicitPrelude   #-}
+{-# LANGUAGE FlexibleContexts    #-}
+{-# LANGUAGE OverloadedStrings   #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE ViewPatterns        #-}
+{- |
+   Module      : Text.Pandoc.Readers.RST
+   Copyright   : Copyright (C) 2006-2019 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Conversion from CSV to a 'Pandoc' table.
+-}
+module Text.Pandoc.Readers.CSV ( readCSV ) where
+import Prelude
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Parsec
+import Text.Parsec.Text (Parser)
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Shared (crFilter)
+import Text.Pandoc.Error
+import Text.Pandoc.Options (ReaderOptions(..))
+import Control.Monad.Except (throwError)
+
+readCSV :: PandocMonad m
+        => ReaderOptions -- ^ Reader options
+        -> Text          -- ^ Text to parse (assuming @'\n'@ line endings)
+        -> m Pandoc
+readCSV opts s = do
+  let columns = readerColumns opts
+  case parse pCSV "input" (crFilter s) of
+    Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) hdrs rows
+       where capt = mempty
+             numcols = length r
+             hdrs = map (B.plain . B.text) r
+             rows = map (map (B.plain . B.text)) rs
+             maximum' [] = 0
+             maximum' xs = maximum xs
+             aligns = replicate numcols AlignDefault
+             widths = replicate numcols 0
+    Right []     -> return $ B.doc mempty
+    Left e       -> throwError $ PandocParsecError s e
+
+{- from RFC 4180
+
+   The ABNF grammar [2] appears as follows:
+
+   file = [header CRLF] record *(CRLF record) [CRLF]
+
+   header = name *(COMMA name)
+
+   record = field *(COMMA field)
+
+   name = field
+
+   field = (escaped / non-escaped)
+
+   escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
+
+   non-escaped = *TEXTDATA
+
+   COMMA = %x2C
+
+   CR = %x0D ;as per section 6.1 of RFC 2234 [2]
+
+   DQUOTE =  %x22 ;as per section 6.1 of RFC 2234 [2]
+
+   LF = %x0A ;as per section 6.1 of RFC 2234 [2]
+
+   CRLF = CR LF ;as per section 6.1 of RFC 2234 [2]
+
+   TEXTDATA =  %x20-21 / %x23-2B / %x2D-7E
+-}
+
+pCSV :: Parser [[Text]]
+pCSV = many pRecord
+
+pRecord :: Parser [Text]
+pRecord = do
+  x <- pField
+  xs <- many $ pComma >> pField
+  () <$ newline <|> eof
+  return (x:xs)
+
+pField :: Parser Text
+pField = pEscaped <|> pUnescaped
+
+pComma :: Parser Char
+pComma = char ','
+
+pUnescaped :: Parser Text
+pUnescaped = T.strip . T.pack <$> many1 (noneOf "\n\r\",")
+
+pEscaped :: Parser Text
+pEscaped = do
+  char '"'
+  t <- T.pack <$> many (pDoubledQuote <|> noneOf "\"")
+  char '"'
+  return t
+
+pDoubledQuote :: Parser Char
+pDoubledQuote = try $ char '"' >> char '"'
author	John MacFarlane <jgm@berkeley.edu>	2020-01-31 21:14:21 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2020-01-31 21:14:21 -0800
commit	f9514ccb9e6efbd33ad84fcc1dfef20603affc4c (patch)
tree	4da927d6dd9e253d02efb08f30b211d865f33d86 /src/Text
parent	cef30e738460c177e74df39166b351bec75857ea (diff)
download	pandoc-f9514ccb9e6efbd33ad84fcc1dfef20603affc4c.tar.gz