aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2020-01-31 21:14:21 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2020-01-31 21:14:21 -0800
commitf9514ccb9e6efbd33ad84fcc1dfef20603affc4c (patch)
tree4da927d6dd9e253d02efb08f30b211d865f33d86
parentcef30e738460c177e74df39166b351bec75857ea (diff)
downloadpandoc-f9514ccb9e6efbd33ad84fcc1dfef20603affc4c.tar.gz
Add Text.Pandoc.Readers.CSV (readCSV).
This adds csv as an input format. The CSV table is converted into a pandoc simple table. Closes #6100.
-rw-r--r--MANUAL.txt2
-rw-r--r--pandoc.cabal5
-rw-r--r--src/Text/Pandoc/App/FormatHeuristics.hs1
-rw-r--r--src/Text/Pandoc/Readers.hs3
-rw-r--r--src/Text/Pandoc/Readers/CSV.hs108
-rw-r--r--test/command/csv.md17
6 files changed, 134 insertions, 2 deletions
diff --git a/MANUAL.txt b/MANUAL.txt
index a776fab64..c90ac7b01 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -227,6 +227,7 @@ header when requesting a document from a URL:
::: {#input-formats}
- `commonmark` ([CommonMark] Markdown)
- `creole` ([Creole 1.0])
+ - `csv` ([CSV] table)
- `docbook` ([DocBook])
- `docx` ([Word docx])
- `dokuwiki` ([DokuWiki markup])
@@ -467,6 +468,7 @@ header when requesting a document from a URL:
[TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax
[Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html
[Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0
+[CSV]: https://tools.ietf.org/html/rfc4180
[roff man]: https://man.cx/groff_man(7)
[roff ms]: https://man.cx/groff_ms(7)
[Haskell]: https://www.haskell.org
diff --git a/pandoc.cabal b/pandoc.cabal
index 1b50ddd7c..3b8e84634 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -1,5 +1,5 @@
name: pandoc
-version: 2.9.1.1
+version: 2.9.2
cabal-version: 2.0
build-type: Simple
license: GPL-2
@@ -21,7 +21,7 @@ description: Pandoc is a Haskell library for converting from one markup
TikiWiki markup, Jira markup, Creole 1.0, Haddock markup,
OPML, Emacs Org-Mode, Emacs Muse, txt2tags, ipynb (Jupyter
notebooks), Vimwiki, Word Docx, ODT, EPUB, FictionBook2,
- roff man, and Textile, and it can write Markdown,
+ roff man, Textile, and CSV, and it can write Markdown,
reStructuredText, XHTML, HTML 5, LaTeX, ConTeXt, DocBook,
JATS, OPML, TEI, OpenDocument, ODT, Word docx,
PowerPoint pptx, RTF, MediaWiki, DokuWiki, XWiki,
@@ -503,6 +503,7 @@ library
Text.Pandoc.Readers.FB2,
Text.Pandoc.Readers.DokuWiki,
Text.Pandoc.Readers.Ipynb,
+ Text.Pandoc.Readers.CSV,
Text.Pandoc.Writers,
Text.Pandoc.Writers.Native,
Text.Pandoc.Writers.Docbook,
diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs
index 25e0a303e..21de09f3d 100644
--- a/src/Text/Pandoc/App/FormatHeuristics.hs
+++ b/src/Text/Pandoc/App/FormatHeuristics.hs
@@ -75,5 +75,6 @@ formatFromFilePath x =
".wiki" -> Just "mediawiki"
".xhtml" -> Just "html"
".ipynb" -> Just "ipynb"
+ ".csv" -> Just "csv"
['.',y] | y `elem` ['1'..'9'] -> Just "man"
_ -> Nothing
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 990e78f35..28c7c6810 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -51,6 +51,7 @@ module Text.Pandoc.Readers
, readMuse
, readFB2
, readIpynb
+ , readCSV
-- * Miscellaneous
, getReader
, getDefaultExtensions
@@ -95,6 +96,7 @@ import Text.Pandoc.Readers.TWiki
import Text.Pandoc.Readers.Txt2Tags
import Text.Pandoc.Readers.Vimwiki
import Text.Pandoc.Readers.Man
+import Text.Pandoc.Readers.CSV
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Parsec.Error
@@ -136,6 +138,7 @@ readers = [ ("native" , TextReader readNative)
,("man" , TextReader readMan)
,("fb2" , TextReader readFB2)
,("ipynb" , TextReader readIpynb)
+ ,("csv" , TextReader readCSV)
]
-- | Retrieve reader, extensions based on formatSpec (format+extensions).
diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs
new file mode 100644
index 000000000..3eea9ccbd
--- /dev/null
+++ b/src/Text/Pandoc/Readers/CSV.hs
@@ -0,0 +1,108 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+ Module : Text.Pandoc.Readers.RST
+ Copyright : Copyright (C) 2006-2019 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion from CSV to a 'Pandoc' table.
+-}
+module Text.Pandoc.Readers.CSV ( readCSV ) where
+import Prelude
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.Pandoc.Definition
+import qualified Text.Pandoc.Builder as B
+import Text.Parsec
+import Text.Parsec.Text (Parser)
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Shared (crFilter)
+import Text.Pandoc.Error
+import Text.Pandoc.Options (ReaderOptions(..))
+import Control.Monad.Except (throwError)
+
+readCSV :: PandocMonad m
+ => ReaderOptions -- ^ Reader options
+ -> Text -- ^ Text to parse (assuming @'\n'@ line endings)
+ -> m Pandoc
+readCSV opts s = do
+ let columns = readerColumns opts
+ case parse pCSV "input" (crFilter s) of
+ Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) hdrs rows
+ where capt = mempty
+ numcols = length r
+ hdrs = map (B.plain . B.text) r
+ rows = map (map (B.plain . B.text)) rs
+ maximum' [] = 0
+ maximum' xs = maximum xs
+ aligns = replicate numcols AlignDefault
+ widths = replicate numcols 0
+ Right [] -> return $ B.doc mempty
+ Left e -> throwError $ PandocParsecError s e
+
+{- from RFC 4180
+
+ The ABNF grammar [2] appears as follows:
+
+ file = [header CRLF] record *(CRLF record) [CRLF]
+
+ header = name *(COMMA name)
+
+ record = field *(COMMA field)
+
+ name = field
+
+ field = (escaped / non-escaped)
+
+ escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
+
+ non-escaped = *TEXTDATA
+
+ COMMA = %x2C
+
+ CR = %x0D ;as per section 6.1 of RFC 2234 [2]
+
+ DQUOTE = %x22 ;as per section 6.1 of RFC 2234 [2]
+
+ LF = %x0A ;as per section 6.1 of RFC 2234 [2]
+
+ CRLF = CR LF ;as per section 6.1 of RFC 2234 [2]
+
+ TEXTDATA = %x20-21 / %x23-2B / %x2D-7E
+-}
+
+pCSV :: Parser [[Text]]
+pCSV = many pRecord
+
+pRecord :: Parser [Text]
+pRecord = do
+ x <- pField
+ xs <- many $ pComma >> pField
+ () <$ newline <|> eof
+ return (x:xs)
+
+pField :: Parser Text
+pField = pEscaped <|> pUnescaped
+
+pComma :: Parser Char
+pComma = char ','
+
+pUnescaped :: Parser Text
+pUnescaped = T.strip . T.pack <$> many1 (noneOf "\n\r\",")
+
+pEscaped :: Parser Text
+pEscaped = do
+ char '"'
+ t <- T.pack <$> many (pDoubledQuote <|> noneOf "\"")
+ char '"'
+ return t
+
+pDoubledQuote :: Parser Char
+pDoubledQuote = try $ char '"' >> char '"'
diff --git a/test/command/csv.md b/test/command/csv.md
new file mode 100644
index 000000000..4723bc4f6
--- /dev/null
+++ b/test/command/csv.md
@@ -0,0 +1,17 @@
+```
+% pandoc -f csv -t native
+Fruit,Price,Quantity
+Apple,25 cents,33
+"""Navel"" Orange","35 cents",22
+^D
+[Table [] [AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0]
+ [[Plain [Str "Fruit"]]
+ ,[Plain [Str "Price"]]
+ ,[Plain [Str "Quantity"]]]
+ [[[Plain [Str "Apple"]]
+ ,[Plain [Str "25",Space,Str "cents"]]
+ ,[Plain [Str "33"]]]
+ ,[[Plain [Str "\"Navel\"",Space,Str "Orange"]]
+ ,[Plain [Str "35",Space,Str "cents"]]
+ ,[Plain [Str "22"]]]]]
+```