diff options
author | John MacFarlane <jgm@berkeley.edu> | 2019-01-13 07:56:55 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2019-01-22 21:45:59 -0800 |
commit | 395ea03069167568f8fccb018c794bd43a787b94 (patch) | |
tree | 6b42b2f5daec10b5f5bc5e62b42efc88f7a1053b /src/Text/Pandoc/Readers | |
parent | 5ddd7b121e1aea061b3e7b831dabbd13311929ff (diff) | |
download | pandoc-395ea03069167568f8fccb018c794bd43a787b94.tar.gz |
Support ipynb (Jupyter notebook) as input and output format.
[API change]
* Depend on ipynb library.
* Add `ipynb` as input and output format.
* Added Text.Pandoc.Readers.Ipynb (supports both nbformat v3 and v4).
* Added Text.Pandoc.Writers.Ipynb (supports nbformat v4).
* Added ipynb readers and writers to T.P.Readers,
T.P.Writers, and T.P.Extensions. Register the
file extension .ipynb for this format.
* Add `PandocIpynbDecodingError` constructor to Text.Pandoc.Error.Error.
* Note: there is no template for ipynb.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/Ipynb.hs | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Ipynb.hs b/src/Text/Pandoc/Readers/Ipynb.hs new file mode 100644 index 000000000..2497e6a0a --- /dev/null +++ b/src/Text/Pandoc/Readers/Ipynb.hs @@ -0,0 +1,249 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ScopedTypeVariables #-} +{- +Copyright (C) 2019 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Ipynb + Copyright : Copyright (C) 2019 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Ipynb (Jupyter notebook JSON format) reader for pandoc. +-} +module Text.Pandoc.Readers.Ipynb ( readIpynb ) +where +import Prelude +import Data.Maybe (fromMaybe) +import Data.Digest.Pure.SHA (sha1, showDigest) +import Text.Pandoc.Options +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Logging +import Text.Pandoc.Definition +import Data.Ipynb as Ipynb +import Text.Pandoc.Class +import Text.Pandoc.MIME (extensionFromMimeType) +import Text.Pandoc.UTF8 +import Text.Pandoc.Error +import Data.Text (Text) +import qualified Data.Map as M +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.ByteString.Lazy as BL +import Data.Aeson as Aeson +import Control.Monad.Except (throwError) +import Text.Pandoc.Readers.Markdown (readMarkdown) +import Text.Pandoc.Readers.HTML (readHtml) + +readIpynb :: PandocMonad m => ReaderOptions -> Text -> m Pandoc +readIpynb opts t = do + let src = BL.fromStrict (TE.encodeUtf8 t) + case eitherDecode src of + Right (notebook4 :: Notebook NbV4) -> notebookToPandoc opts notebook4 + Left _ -> + case eitherDecode src of + Right (notebook3 :: Notebook NbV3) -> notebookToPandoc opts notebook3 + Left err -> throwError $ PandocIpynbDecodingError err + +notebookToPandoc :: (PandocMonad m, FromJSON (Notebook a)) + => ReaderOptions -> Notebook a -> m Pandoc +notebookToPandoc opts notebook = do + let cells = notebookCells notebook + let (fmt,fmtminor) = notebookFormat notebook + let m = M.insert "nbformat" (MetaString $ show fmt) $ + M.insert "nbformat_minor" (MetaString $ show fmtminor) $ + jsonMetaToMeta (notebookMetadata notebook) + let lang = case M.lookup "kernelspec" m of + Just (MetaMap ks) -> + case M.lookup "language" ks of + Just (MetaString l) -> l + _ -> "python" + _ -> "python" + bs <- mconcat <$> mapM (cellToBlocks opts lang) cells + let Pandoc _ blocks = B.doc bs + return $ Pandoc (Meta m) blocks + +cellToBlocks :: PandocMonad m + => ReaderOptions -> String -> Cell a -> m B.Blocks +cellToBlocks opts lang c = do + let Source ts = cellSource c + let source = mconcat ts + let kvs = jsonMetaToPairs (cellMetadata c) + let attachments = maybe mempty M.toList $ cellAttachments c + mapM_ addAttachment attachments + case cellType c of + Ipynb.Markdown -> do + Pandoc _ bs <- readMarkdown opts source + return $ B.divWith ("",["cell","markdown"],kvs) + $ B.fromList bs + Ipynb.Heading lev -> do + Pandoc _ bs <- readMarkdown opts + (T.replicate lev "#" <> " " <> source) + return $ B.divWith ("",["cell","markdown"],kvs) + $ B.fromList bs + Ipynb.Raw -> do + let format = fromMaybe "" $ lookup "format" kvs + let format' = + case format of + "text/html" -> "html" + "text/latex" -> "latex" + "application/pdf" -> "latex" + "text/markdown" -> "markdown" + "text/x-rsrt" -> "rst" + _ -> format + return $ B.divWith ("",["cell","raw"],kvs) $ B.rawBlock format' + $ T.unpack source + Ipynb.Code{ codeOutputs = outputs, codeExecutionCount = ec } -> do + outputBlocks <- mconcat <$> mapM (outputToBlock opts) outputs + let kvs' = maybe kvs (\x -> ("execution_count", show x):kvs) ec + return $ B.divWith ("",["cell","code"],kvs') $ + B.codeBlockWith ("",[lang],[]) (T.unpack source) + <> outputBlocks + +addAttachment :: PandocMonad m => (Text, MimeBundle) -> m () +addAttachment (fname, mimeBundle) = do + let fp = T.unpack fname + case M.toList (unMimeBundle mimeBundle) of + (mimeType, BinaryData bs):_ -> + insertMedia fp (Just $ T.unpack mimeType) (BL.fromStrict bs) + (mimeType, TextualData t):_ -> + insertMedia fp (Just $ T.unpack mimeType) + (BL.fromStrict $ TE.encodeUtf8 t) + (mimeType, JsonData v):_ -> + insertMedia fp (Just $ T.unpack mimeType) (encode v) + [] -> report $ CouldNotFetchResource fp "no attachment" + +outputToBlock :: PandocMonad m => ReaderOptions -> Output a -> m B.Blocks +outputToBlock _ Stream{ streamName = sName, + streamText = Source text } = do + return $ B.divWith ("",["output","stream",T.unpack sName],[]) + $ B.codeBlock $ T.unpack . mconcat $ text +outputToBlock opts DisplayData{ displayData = data', + displayMetadata = metadata' } = + B.divWith ("",["output", "display_data"],[]) <$> + handleData opts metadata' data' +outputToBlock opts ExecuteResult{ executeCount = ec, + executeData = data', + executeMetadata = metadata' } = + B.divWith ("",["output", "execute_result"],[("execution_count",show ec)]) + <$> handleData opts metadata' data' +outputToBlock _ Err{ errName = ename, + errValue = evalue, + errTraceback = traceback } = do + return $ B.divWith ("",["output","error"], + [("ename",T.unpack ename), + ("evalue",T.unpack evalue)]) + $ B.codeBlock $ T.unpack . T.unlines $ traceback + +-- We want to display the richest output possible given +-- the output format. +handleData :: PandocMonad m + => ReaderOptions -> JSONMeta -> MimeBundle -> m B.Blocks +handleData opts metadata (MimeBundle mb) = do + let mimePairs = M.toList mb + + results <- mapM dataBlock mimePairs + + -- return the result with highest priority: + + let highest = maximum (0 : map fst results) + return $ case [r | (pr, r) <- results, pr == highest] of + x:_ -> x + [] -> mempty + + where + + exts = readerExtensions opts + + dataBlock :: PandocMonad m => (MimeType, MimeData) -> m (Int, B.Blocks) + dataBlock (mt, BinaryData bs) + | "image/" `T.isPrefixOf` mt + = do + -- normally metadata maps from mime types to key-value map; + -- but not always... + let meta = case M.lookup mt metadata of + Just v@(Object{}) -> + case fromJSON v of + Success m' -> m' + Error _ -> mempty + _ -> mempty + let metaPairs = jsonMetaToPairs meta + let bl = BL.fromStrict bs + -- SHA1 hash for filename + let mt' = T.unpack mt + let fname = showDigest (sha1 bl) ++ + case extensionFromMimeType mt' of + Nothing -> "" + Just ext -> '.':ext + insertMedia fname (Just mt') bl + return (3, B.para $ B.imageWith ("",[],metaPairs) fname "" mempty) + + dataBlock (_, BinaryData _) = return (0, mempty) + + dataBlock ("text/html", TextualData t) + | extensionEnabled Ext_raw_html exts + = return (2, B.rawBlock "html" $ T.unpack t) + | otherwise = do -- try parsing the HTML + Pandoc _ bls <- readHtml opts t + return (1, B.fromList bls) + + dataBlock ("text/latex", TextualData t) = + return $ if extensionEnabled Ext_raw_tex exts + then (2, B.rawBlock "latex" $ T.unpack t) + else (0, mempty) + + dataBlock ("text/plain", TextualData t) = + return (0, B.codeBlock $ T.unpack t) + + dataBlock (_, JsonData v) = + return (2, B.codeBlockWith ("",["json"],[]) $ toStringLazy $ encode v) + + dataBlock _ = return (0, mempty) + +jsonMetaToMeta :: JSONMeta -> M.Map String MetaValue +jsonMetaToMeta = M.mapKeys T.unpack . M.map valueToMetaValue + where + valueToMetaValue :: Value -> MetaValue + valueToMetaValue x@(Object{}) = + case fromJSON x of + Error s -> MetaString s + Success jm' -> MetaMap $ jsonMetaToMeta jm' + valueToMetaValue x@(Array{}) = + case fromJSON x of + Error s -> MetaString s + Success xs -> MetaList $ map valueToMetaValue xs + valueToMetaValue (Bool b) = MetaBool b + valueToMetaValue (String t) = MetaString (T.unpack t) + valueToMetaValue (Number n) = MetaString (show n) + valueToMetaValue Aeson.Null = MetaString "" + +jsonMetaToPairs :: JSONMeta -> [(String, String)] +jsonMetaToPairs = M.toList . M.mapMaybe + (\case + MetaString s -> Just s + MetaBool True -> Just "true" + MetaBool False -> Just "false" + -- for now we skip complex cell metadata: + _ -> Nothing) . jsonMetaToMeta |