Support ipynb (Jupyter notebook) as input and output format.

[API change] * Depend on ipynb library. * Add `ipynb` as input and output format. * Added Text.Pandoc.Readers.Ipynb (supports both nbformat v3 and v4). * Added Text.Pandoc.Writers.Ipynb (supports nbformat v4). * Added ipynb readers and writers to T.P.Readers, T.P.Writers, and T.P.Extensions. Register the file extension .ipynb for this format. * Add `PandocIpynbDecodingError` constructor to Text.Pandoc.Error.Error. * Note: there is no template for ipynb.
author: John MacFarlane <jgm@berkeley.edu> 2019-01-13 07:56:55 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2019-01-22 21:45:59 -0800
commit: 395ea03069167568f8fccb018c794bd43a787b94 (patch)
tree: 6b42b2f5daec10b5f5bc5e62b42efc88f7a1053b /src/Text/Pandoc/Readers
parent: 5ddd7b121e1aea061b3e7b831dabbd13311929ff (diff)
download: pandoc-395ea03069167568f8fccb018c794bd43a787b94.tar.gz
1 files changed, 249 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Ipynb.hs b/src/Text/Pandoc/Readers/Ipynb.hs
new file mode 100644
index 000000000..2497e6a0a
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Ipynb.hs
@@ -0,0 +1,249 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-
+Copyright (C) 2019 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.Ipynb
+   Copyright   : Copyright (C) 2019 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Ipynb (Jupyter notebook JSON format) reader for pandoc.
+-}
+module Text.Pandoc.Readers.Ipynb ( readIpynb )
+where
+import Prelude
+import Data.Maybe (fromMaybe)
+import Data.Digest.Pure.SHA (sha1, showDigest)
+import Text.Pandoc.Options
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Logging
+import Text.Pandoc.Definition
+import Data.Ipynb as Ipynb
+import Text.Pandoc.Class
+import Text.Pandoc.MIME (extensionFromMimeType)
+import Text.Pandoc.UTF8
+import Text.Pandoc.Error
+import Data.Text (Text)
+import qualified Data.Map as M
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as TE
+import qualified Data.ByteString.Lazy as BL
+import Data.Aeson as Aeson
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Readers.Markdown (readMarkdown)
+import Text.Pandoc.Readers.HTML (readHtml)
+
+-- | Read a Jupyter notebook (JSON text) into a 'Pandoc' document, trying
+-- nbformat v4 first, then v3; throws the v3 decoding error if both fail.
+readIpynb :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
+readIpynb opts t =
+  case (eitherDecode src, eitherDecode src) of
+    (Right (nb4 :: Notebook NbV4), _) -> notebookToPandoc opts nb4
+    (_, Right (nb3 :: Notebook NbV3)) -> notebookToPandoc opts nb3
+    (_, Left err) -> throwError $ PandocIpynbDecodingError err
+  where src = BL.fromStrict (TE.encodeUtf8 t)
+
+-- | Convert a decoded notebook: notebook metadata (plus @nbformat@ and
+-- @nbformat_minor@) becomes document metadata, and each cell becomes a Div.
+-- Code cells are tagged with the kernelspec language (default @python@).
+notebookToPandoc :: (PandocMonad m, FromJSON (Notebook a))
+                 => ReaderOptions -> Notebook a -> m Pandoc
+notebookToPandoc opts notebook = do
+  bs <- mconcat <$> mapM (cellToBlocks opts lang) (notebookCells notebook)
+  let Pandoc _ blocks = B.doc bs
+  return $ Pandoc (Meta meta) blocks
+  where
+    (major, minor) = notebookFormat notebook
+    meta = M.insert "nbformat" (MetaString $ show major)
+         . M.insert "nbformat_minor" (MetaString $ show minor)
+         $ jsonMetaToMeta (notebookMetadata notebook)
+    lang = case M.lookup "kernelspec" meta of
+             Just (MetaMap ks) | Just (MetaString l) <- M.lookup "language" ks -> l
+             _ -> "python"
+
+-- | Convert one notebook cell to a Div with classes @cell@ plus the cell kind
+-- (@markdown@, @raw@ or @code@); cell metadata becomes the Div's attributes
+-- and every attachment of the cell is passed to 'addAttachment'.
+cellToBlocks :: PandocMonad m
+             => ReaderOptions -> String -> Cell a -> m B.Blocks
+cellToBlocks opts lang c = do
+  let Source ts = cellSource c
+  let source = mconcat ts
+  let kvs = jsonMetaToPairs (cellMetadata c)
+  mapM_ addAttachment $ maybe mempty M.toList $ cellAttachments c
+  let markdownCell txt = do  -- shared by Markdown and Heading cells
+        Pandoc _ bs <- readMarkdown opts txt
+        return $ B.divWith ("",["cell","markdown"],kvs) $ B.fromList bs
+  case cellType c of
+    Ipynb.Markdown -> markdownCell source
+    -- a heading cell is read as an ATX heading of the given level
+    Ipynb.Heading lev -> markdownCell (T.replicate lev "#" <> " " <> source)
+    Ipynb.Raw -> do
+      -- map the MIME type in the "format" metadata to a pandoc format name
+      let format' =
+            case fromMaybe "" $ lookup "format" kvs of
+              "text/html"             -> "html"
+              "text/latex"            -> "latex"
+              "application/pdf"       -> "latex"
+              "text/markdown"         -> "markdown"
+              "text/x-rst"            -> "rst"
+              "text/restructuredtext" -> "rst"
+              other                   -> other
+      return $ B.divWith ("",["cell","raw"],kvs) $ B.rawBlock format'
+             $ T.unpack source
+    Ipynb.Code{ codeOutputs = outputs, codeExecutionCount = ec } -> do
+      outputBlocks <- mconcat <$> mapM (outputToBlock opts) outputs
+      let kvs' = maybe kvs (\x -> ("execution_count", show x):kvs) ec
+      return $ B.divWith ("",["cell","code"],kvs') $
+        B.codeBlockWith ("",[lang],[]) (T.unpack source)
+        <> outputBlocks
+
+-- | Store a cell attachment under its file name, using the first entry (in
+-- key order) of its MIME bundle; an empty bundle is reported as a warning.
+addAttachment :: PandocMonad m => (Text, MimeBundle) -> m ()
+addAttachment (fname, MimeBundle mb) =
+  case M.toList mb of
+    []                -> report $ CouldNotFetchResource fp "no attachment"
+    (mime, payload):_ -> insertMedia fp (Just $ T.unpack mime) (toBytes payload)
+  where
+    fp = T.unpack fname
+    toBytes (BinaryData bs) = BL.fromStrict bs
+    toBytes (TextualData t) = BL.fromStrict $ TE.encodeUtf8 t
+    toBytes (JsonData v)    = encode v
+
+-- | Convert a code-cell output to a Div with class @output@ plus the output
+-- kind; stream text and error tracebacks become plain code blocks.
+outputToBlock :: PandocMonad m => ReaderOptions -> Output a -> m B.Blocks
+outputToBlock opts output =
+  case output of
+    Stream{ streamName = sName, streamText = Source chunks } ->
+      return $ wrap ["stream", T.unpack sName] []
+             $ B.codeBlock $ T.unpack $ mconcat chunks
+    DisplayData{ displayData = bundle, displayMetadata = meta } ->
+      wrap ["display_data"] [] <$> handleData opts meta bundle
+    ExecuteResult{ executeCount = ec, executeData = bundle,
+                   executeMetadata = meta } ->
+      wrap ["execute_result"] [("execution_count", show ec)]
+        <$> handleData opts meta bundle
+    Err{ errName = ename, errValue = evalue, errTraceback = traceback } ->
+      return $ wrap ["error"] [("ename", T.unpack ename),
+                               ("evalue", T.unpack evalue)]
+             $ B.codeBlock $ T.unpack $ T.unlines traceback
+  where
+    -- every output kind shares the leading "output" class
+    wrap classes kvs = B.divWith ("", "output" : classes, kvs)
+
+-- | Choose the richest representation in an output's MIME bundle that the
+-- enabled reader extensions allow.  Each entry is converted and ranked
+-- (image 3; raw HTML, raw LaTeX, JSON 2; parsed HTML 1; plain text 0) and
+-- the first entry with the highest rank is returned.  Entries that cannot
+-- be rendered rank below zero, so they never shadow a @text/plain@
+-- fallback that comes after them in the bundle's key order.
+handleData :: PandocMonad m
+           => ReaderOptions -> JSONMeta -> MimeBundle -> m B.Blocks
+handleData opts metadata (MimeBundle mb) = do
+  results <- mapM dataBlock (M.toList mb)
+  -- return the result with highest priority:
+  let highest = maximum (0 : map fst results)
+  return $ case [r | (pr, r) <- results, pr == highest] of
+             x:_  -> x
+             []   -> mempty
+  where
+    exts = readerExtensions opts
+    -- rank and content for an entry that cannot be rendered
+    unusable :: (Int, B.Blocks)
+    unusable = (-1, mempty)
+
+    dataBlock :: PandocMonad m => (MimeType, MimeData) -> m (Int, B.Blocks)
+    dataBlock (mt, BinaryData bs)
+     | "image/" `T.isPrefixOf` mt
+      = do
+      -- normally metadata maps from mime types to key-value map;
+      -- but not always...
+      let meta = case M.lookup mt metadata of
+                   Just v@(Object{}) ->
+                     case fromJSON v of
+                       Success m' -> m'
+                       Error _   -> mempty
+                   _ -> mempty
+      let metaPairs = jsonMetaToPairs meta
+      let bl = BL.fromStrict bs
+      -- SHA1 hash for filename
+      let mt' = T.unpack mt
+      let fname = showDigest (sha1 bl) ++
+            case extensionFromMimeType mt' of
+              Nothing  -> ""
+              Just ext -> '.':ext
+      insertMedia fname (Just mt') bl
+      return (3, B.para $ B.imageWith ("",[],metaPairs) fname "" mempty)
+     | otherwise = return unusable
+
+    dataBlock ("text/html", TextualData t)
+      | extensionEnabled Ext_raw_html exts
+        = return (2, B.rawBlock "html" $ T.unpack t)
+      | otherwise = do -- try parsing the HTML
+          Pandoc _ bls <- readHtml opts t
+          return (1, B.fromList bls)
+
+    dataBlock ("text/latex", TextualData t) =
+      return $ if extensionEnabled Ext_raw_tex exts
+                  then (2, B.rawBlock "latex" $ T.unpack t)
+                  else unusable
+
+    dataBlock ("text/plain", TextualData t) =
+      return (0, B.codeBlock $ T.unpack t)
+
+    dataBlock (_, JsonData v) =
+      return (2, B.codeBlockWith ("",["json"],[]) $ toStringLazy $ encode v)
+
+    dataBlock _ = return unusable
+
+-- | Convert notebook JSON metadata to pandoc metadata.  Objects and arrays
+-- are converted recursively; numbers are rendered with 'show', null becomes
+-- the empty string, and an undecodable object or array becomes its error text.
+jsonMetaToMeta :: JSONMeta -> M.Map String MetaValue
+jsonMetaToMeta jm = M.mapKeys T.unpack (M.map convert jm)
+  where
+    convert :: Value -> MetaValue
+    convert = \case
+      Bool b     -> MetaBool b
+      String t   -> MetaString (T.unpack t)
+      Number n   -> MetaString (show n)
+      Aeson.Null -> MetaString ""
+      x@Object{} -> decoded (MetaMap . jsonMetaToMeta) x
+      x@Array{}  -> decoded (MetaList . map convert) x
+    decoded :: FromJSON b => (b -> MetaValue) -> Value -> MetaValue
+    decoded ok v = case fromJSON v of { Error s -> MetaString s; Success y -> ok y }
+
+-- | Flatten JSON metadata to attribute pairs, keeping only values that convert
+-- to 'MetaString' or 'MetaBool' (booleans as @true@/@false@).
+jsonMetaToPairs :: JSONMeta -> [(String, String)]
+jsonMetaToPairs jm = M.toList (M.mapMaybe simple (jsonMetaToMeta jm))
+  where
+    simple (MetaString s) = Just s
+    simple (MetaBool b)   = Just (if b then "true" else "false")
+    simple _              = Nothing  -- for now we skip complex cell metadata
author	John MacFarlane <jgm@berkeley.edu>	2019-01-13 07:56:55 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2019-01-22 21:45:59 -0800
commit	395ea03069167568f8fccb018c794bd43a787b94 (patch)
tree	6b42b2f5daec10b5f5bc5e62b42efc88f7a1053b /src/Text/Pandoc/Readers
parent	5ddd7b121e1aea061b3e7b831dabbd13311929ff (diff)
download	pandoc-395ea03069167568f8fccb018c794bd43a787b94.tar.gz