Added gfm (GitHub-flavored CommonMark) as an input and output format.

This uses bindings to GitHub's fork of cmark, so it should parse gfm exactly as GitHub does (excepting certain postprocessing steps, involving notifications, emojis, etc.). * Added Text.Pandoc.Readers.GFM (exporting readGFM) * Added Text.Pandoc.Writers.GFM (exporting writeGFM) * Added `gfm` as input and output forma Note that tables are currently always rendered as HTML in the writer; this can be improved when CMarkGFM supports tables in output.
author: John MacFarlane <jgm@berkeley.edu> 2017-08-03 10:50:05 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2017-08-07 16:59:31 -0700
commit: 2c81c4c218be757ff2610e3b47548f0642ce5c10 (patch)
tree: 888463a66a9af9bf56f9f635f2b0047080a4dbd3 /src/Text/Pandoc/Readers
parent: 8995281691b56d711c44fb0c5cae2fc675d12eb9 (diff)
download: pandoc-2c81c4c218be757ff2610e3b47548f0642ce5c10.tar.gz
2 files changed, 187 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs
index 3c62f8db5..a0ea9f3e8 100644
--- a/src/Text/Pandoc/Readers/CommonMark.hs
+++ b/src/Text/Pandoc/Readers/CommonMark.hs
@@ -44,8 +44,8 @@ readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
 readCommonMark opts s = return $
   nodeToPandoc $ commonmarkToNode opts' s
   where opts' = if extensionEnabled Ext_smart (readerExtensions opts)
-                   then [optNormalize, optSmart]
-                   else [optNormalize]
+                   then [optSmart]
+                   else []
 
 nodeToPandoc :: Node -> Pandoc
 nodeToPandoc (Node _ DOCUMENT nodes) =
diff --git a/src/Text/Pandoc/Readers/GFM.hs b/src/Text/Pandoc/Readers/GFM.hs
new file mode 100644
index 000000000..1cdc47b98
--- /dev/null
+++ b/src/Text/Pandoc/Readers/GFM.hs
@@ -0,0 +1,185 @@
+{-
+Copyright (C) 2017 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.GFM
+   Copyright   : Copyright (C) 2017 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of GitHub flavored CommonMark text to 'Pandoc' document.
+
+CommonMark is a strongly specified variant of Markdown: http://commonmark.org.
+-}
+module Text.Pandoc.Readers.GFM (readGFM)
+where
+
+import CMarkGFM
+import Data.List (groupBy)
+import Data.Text (Text, unpack)
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+
+-- | Parse a CommonMark formatted string into a 'Pandoc' structure.
+readGFM :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
+readGFM opts s = return $
+  nodeToPandoc $ commonmarkToNode opts' exts s
+  where opts' = [optSmart | enabled Ext_smart]
+        exts = [extStrikethrough, extTable, extAutolink]
+        enabled x = extensionEnabled x (readerExtensions opts)
+
+nodeToPandoc :: Node -> Pandoc
+nodeToPandoc (Node _ DOCUMENT nodes) =
+  Pandoc nullMeta $ foldr addBlock [] nodes
+nodeToPandoc n =  -- shouldn't happen
+  Pandoc nullMeta $ foldr addBlock [] [n]
+
+addBlocks :: [Node] -> [Block]
+addBlocks = foldr addBlock []
+
+addBlock :: Node -> [Block] -> [Block]
+addBlock (Node _ PARAGRAPH nodes) =
+  (Para (addInlines nodes) :)
+addBlock (Node _ THEMATIC_BREAK _) =
+  (HorizontalRule :)
+addBlock (Node _ BLOCK_QUOTE nodes) =
+  (BlockQuote (addBlocks nodes) :)
+addBlock (Node _ (HTML_BLOCK t) _) =
+  (RawBlock (Format "html") (unpack t) :)
+-- Note:  the cmark parser will never generate CUSTOM_BLOCK,
+-- so we don't need to handle it:
+addBlock (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) =
+  id
+addBlock (Node _ (CODE_BLOCK info t) _) =
+  (CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :)
+addBlock (Node _ (HEADING lev) nodes) =
+  (Header lev ("",[],[]) (addInlines nodes) :)
+addBlock (Node _ (LIST listAttrs) nodes) =
+  (constructor (map (setTightness . addBlocks . children) nodes) :)
+  where constructor = case listType listAttrs of
+                       BULLET_LIST  -> BulletList
+                       ORDERED_LIST -> OrderedList
+                                         (start, DefaultStyle, delim)
+        start = listStart listAttrs
+        setTightness = if listTight listAttrs
+                           then map paraToPlain
+                           else id
+        paraToPlain (Para xs) = Plain (xs)
+        paraToPlain x         = x
+        delim = case listDelim listAttrs of
+                     PERIOD_DELIM -> Period
+                     PAREN_DELIM  -> OneParen
+addBlock (Node _ (TABLE alignments) nodes) = do
+  (Table [] aligns widths headers rows :)
+  where aligns = map fromTableCellAlignment alignments
+        fromTableCellAlignment NoAlignment   = AlignDefault
+        fromTableCellAlignment LeftAligned   = AlignLeft
+        fromTableCellAlignment RightAligned  = AlignRight
+        fromTableCellAlignment CenterAligned = AlignCenter
+        widths = replicate numcols 0.0
+        numcols = if null rows'
+                     then 0
+                     else maximum $ map length rows'
+        rows' = map toRow $ filter isRow nodes
+        (headers, rows) = case rows' of
+                               (h:rs) -> (h, rs)
+                               []     -> ([], [])
+        isRow (Node _ TABLE_ROW _) = True
+        isRow _ = False
+        isCell (Node _ TABLE_CELL _) = True
+        isCell _ = False
+        toRow (Node _ TABLE_ROW ns) = map toCell $ filter isCell ns
+        toRow (Node _ t _) = error $ "toRow encountered non-row " ++ show t
+        toCell (Node _ TABLE_CELL []) = []
+        toCell (Node _ TABLE_CELL (n:ns))
+          | isBlockNode n = addBlocks (n:ns)
+          | otherwise     = [Plain (addInlines (n:ns))]
+        toCell (Node _ t _) = error $ "toCell encountered non-cell " ++ show t
+addBlock (Node _ TABLE_ROW _) = id -- handled in TABLE
+addBlock (Node _ TABLE_CELL _) = id -- handled in TABLE
+addBlock _ = id
+
+isBlockNode :: Node -> Bool
+isBlockNode (Node _ nodetype _) =
+  case nodetype of
+       DOCUMENT -> True
+       THEMATIC_BREAK -> True
+       PARAGRAPH -> True
+       BLOCK_QUOTE -> True
+       HTML_BLOCK _ -> True
+       CUSTOM_BLOCK _ _ -> True
+       CODE_BLOCK _ _ -> True
+       HEADING _ -> True
+       LIST _ -> True
+       ITEM -> True
+       TEXT _ -> False
+       SOFTBREAK -> False
+       LINEBREAK -> False
+       HTML_INLINE _ -> False
+       CUSTOM_INLINE _ _ -> False
+       CODE _ -> False
+       EMPH -> False
+       STRONG -> False
+       LINK _ _ -> False
+       IMAGE _ _ -> False
+       STRIKETHROUGH -> False
+       TABLE _ -> False
+       TABLE_ROW -> False
+       TABLE_CELL -> False
+
+children :: Node -> [Node]
+children (Node _ _ ns) = ns
+
+addInlines :: [Node] -> [Inline]
+addInlines = foldr addInline []
+
+addInline :: Node -> [Inline] -> [Inline]
+addInline (Node _ (TEXT t) _) = (map toinl clumps ++)
+  where raw = unpack t
+        clumps = groupBy samekind raw
+        samekind ' ' ' ' = True
+        samekind ' ' _   = False
+        samekind _   ' ' = False
+        samekind _  _    = True
+        toinl (' ':_) = Space
+        toinl xs      = Str xs
+addInline (Node _ LINEBREAK _) = (LineBreak :)
+addInline (Node _ SOFTBREAK _) = (SoftBreak :)
+addInline (Node _ (HTML_INLINE t) _) =
+  (RawInline (Format "html") (unpack t) :)
+-- Note:  the cmark parser will never generate CUSTOM_BLOCK,
+-- so we don't need to handle it:
+addInline (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) =
+  id
+addInline (Node _ (CODE t) _) =
+  (Code ("",[],[]) (unpack t) :)
+addInline (Node _ EMPH nodes) =
+  (Emph (addInlines nodes) :)
+addInline (Node _ STRONG nodes) =
+  (Strong (addInlines nodes) :)
+addInline (Node _ STRIKETHROUGH nodes) =
+  (Strikeout (addInlines nodes) :)
+addInline (Node _ (LINK url title) nodes) =
+  (Link nullAttr (addInlines nodes) (unpack url, unpack title) :)
+addInline (Node _ (IMAGE url title) nodes) =
+  (Image nullAttr (addInlines nodes) (unpack url, unpack title) :)
+addInline _ = id
author	John MacFarlane <jgm@berkeley.edu>	2017-08-03 10:50:05 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2017-08-07 16:59:31 -0700
commit	2c81c4c218be757ff2610e3b47548f0642ce5c10 (patch)
tree	888463a66a9af9bf56f9f635f2b0047080a4dbd3 /src/Text/Pandoc/Readers
parent	8995281691b56d711c44fb0c5cae2fc675d12eb9 (diff)
download	pandoc-2c81c4c218be757ff2610e3b47548f0642ce5c10.tar.gz