diff options
author | John MacFarlane <jgm@berkeley.edu> | 2020-01-31 21:14:21 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2020-01-31 21:14:21 -0800 |
commit | f9514ccb9e6efbd33ad84fcc1dfef20603affc4c (patch) | |
tree | 4da927d6dd9e253d02efb08f30b211d865f33d86 /src/Text/Pandoc | |
parent | cef30e738460c177e74df39166b351bec75857ea (diff) | |
download | pandoc-f9514ccb9e6efbd33ad84fcc1dfef20603affc4c.tar.gz |
Add Text.Pandoc.Readers.CSV (readCSV).
This adds csv as an input format.
The CSV table is converted into a pandoc simple table.
Closes #6100.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/App/FormatHeuristics.hs | 1 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/CSV.hs | 108 |
3 files changed, 112 insertions, 0 deletions
diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs index 25e0a303e..21de09f3d 100644 --- a/src/Text/Pandoc/App/FormatHeuristics.hs +++ b/src/Text/Pandoc/App/FormatHeuristics.hs @@ -75,5 +75,6 @@ formatFromFilePath x = ".wiki" -> Just "mediawiki" ".xhtml" -> Just "html" ".ipynb" -> Just "ipynb" + ".csv" -> Just "csv" ['.',y] | y `elem` ['1'..'9'] -> Just "man" _ -> Nothing diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 990e78f35..28c7c6810 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -51,6 +51,7 @@ module Text.Pandoc.Readers , readMuse , readFB2 , readIpynb + , readCSV -- * Miscellaneous , getReader , getDefaultExtensions @@ -95,6 +96,7 @@ import Text.Pandoc.Readers.TWiki import Text.Pandoc.Readers.Txt2Tags import Text.Pandoc.Readers.Vimwiki import Text.Pandoc.Readers.Man +import Text.Pandoc.Readers.CSV import qualified Text.Pandoc.UTF8 as UTF8 import Text.Parsec.Error @@ -136,6 +138,7 @@ readers = [ ("native" , TextReader readNative) ,("man" , TextReader readMan) ,("fb2" , TextReader readFB2) ,("ipynb" , TextReader readIpynb) + ,("csv" , TextReader readCSV) ] -- | Retrieve reader, extensions based on formatSpec (format+extensions). diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs new file mode 100644 index 000000000..3eea9ccbd --- /dev/null +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -0,0 +1,108 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.RST + Copyright : Copyright (C) 2006-2019 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion from CSV to a 'Pandoc' table. +-} +module Text.Pandoc.Readers.CSV ( readCSV ) where +import Prelude +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Parsec +import Text.Parsec.Text (Parser) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Shared (crFilter) +import Text.Pandoc.Error +import Text.Pandoc.Options (ReaderOptions(..)) +import Control.Monad.Except (throwError) + +readCSV :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> m Pandoc +readCSV opts s = do + let columns = readerColumns opts + case parse pCSV "input" (crFilter s) of + Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) hdrs rows + where capt = mempty + numcols = length r + hdrs = map (B.plain . B.text) r + rows = map (map (B.plain . B.text)) rs + maximum' [] = 0 + maximum' xs = maximum xs + aligns = replicate numcols AlignDefault + widths = replicate numcols 0 + Right [] -> return $ B.doc mempty + Left e -> throwError $ PandocParsecError s e + +{- from RFC 4180 + + The ABNF grammar [2] appears as follows: + + file = [header CRLF] record *(CRLF record) [CRLF] + + header = name *(COMMA name) + + record = field *(COMMA field) + + name = field + + field = (escaped / non-escaped) + + escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE + + non-escaped = *TEXTDATA + + COMMA = %x2C + + CR = %x0D ;as per section 6.1 of RFC 2234 [2] + + DQUOTE = %x22 ;as per section 6.1 of RFC 2234 [2] + + LF = %x0A ;as per section 6.1 of RFC 2234 [2] + + CRLF = CR LF ;as per section 6.1 of RFC 2234 [2] + + TEXTDATA = %x20-21 / %x23-2B / %x2D-7E +-} + +pCSV :: Parser [[Text]] +pCSV = many pRecord + +pRecord :: Parser [Text] +pRecord = do + x <- pField + xs <- many $ pComma >> pField + () <$ newline <|> eof + return (x:xs) + +pField :: Parser Text +pField = pEscaped <|> pUnescaped + +pComma :: Parser Char +pComma = char ',' + +pUnescaped :: Parser Text +pUnescaped = T.strip . T.pack <$> many1 (noneOf "\n\r\",") + +pEscaped :: Parser Text +pEscaped = do + char '"' + t <- T.pack <$> many (pDoubledQuote <|> noneOf "\"") + char '"' + return t + +pDoubledQuote :: Parser Char +pDoubledQuote = try $ char '"' >> char '"' |