From e0d234e54d18a82a7c90aa3946f890140e200051 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 16 Mar 2015 22:20:42 -0700 Subject: Added CommonMark reader using cmark (libcmark bindings). - Added commonmark as an input format. - Added `Text.Pandoc.Readers.CommonMark.readCommonMark`. - For now, we use the markdown writer to generate benchmark text for the CommonMark reader. We can change this when we get a writer. --- README | 25 +++---- benchmark/benchmark-pandoc.hs | 18 ++++-- pandoc.cabal | 4 +- src/Text/Pandoc.hs | 3 + src/Text/Pandoc/Readers/CommonMark.hs | 118 ++++++++++++++++++++++++++++++++++ 5 files changed, 150 insertions(+), 18 deletions(-) create mode 100644 src/Text/Pandoc/Readers/CommonMark.hs diff --git a/README b/README index 41bf0e4db..81e3f877b 100644 --- a/README +++ b/README @@ -12,17 +12,18 @@ Description Pandoc is a [Haskell] library for converting from one markup format to another, and a command-line tool that uses this library. It can read -[markdown] and (subsets of) [Textile], [reStructuredText], [HTML], -[LaTeX], [MediaWiki markup], [TWiki markup], [Haddock markup], [OPML], -[Emacs Org-mode], [DocBook], [txt2tags], [EPUB] and [Word docx]; and -it can write plain text, [markdown], [reStructuredText], [XHTML], -[HTML 5], [LaTeX] (including [beamer] slide shows), [ConTeXt], [RTF], -[OPML], [DocBook], [OpenDocument], [ODT], [Word docx], [GNU Texinfo], -[MediaWiki markup], [DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3), -[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode], [AsciiDoc], -[InDesign ICML], and [Slidy], [Slideous], [DZSlides], [reveal.js] or -[S5] HTML slide shows. It can also produce [PDF] output on systems where -LaTeX is installed. 
+[Markdown], [CommonMark], and (subsets of) [Textile], +[reStructuredText], [HTML], [LaTeX], [MediaWiki markup], [TWiki +markup], [Haddock markup], [OPML], [Emacs Org-mode], [DocBook], +[txt2tags], [EPUB] and [Word docx]; and it can write plain text, +[Markdown], [reStructuredText], [XHTML], [HTML 5], [LaTeX] (including +[beamer] slide shows), [ConTeXt], [RTF], [OPML], [DocBook], +[OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup], +[DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3), +[FictionBook2], [Textile], [groff man] pages, [Emacs Org-Mode], +[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides], +[reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output +on systems where LaTeX is installed. Pandoc's enhanced version of markdown includes syntax for footnotes, tables, flexible ordered lists, definition lists, fenced code blocks, @@ -159,6 +160,7 @@ General options extended markdown), `markdown_strict` (original unextended markdown), `markdown_phpextra` (PHP Markdown Extra extended markdown), `markdown_github` (github extended markdown), + `commonmark` (CommonMark markdown), `textile` (Textile), `rst` (reStructuredText), `html` (HTML), `docbook` (DocBook), `t2t` (txt2tags), `docx` (docx), `epub` (EPUB), `opml` (OPML), `org` (Emacs Org-mode), `mediawiki` (MediaWiki markup), @@ -3238,3 +3240,4 @@ Rosenthal. 
[EPUB]: http://idpf.org/epub [EPUBspine]: http://www.idpf.org/epub/301/spec/epub-publications.html#sec-spine-elem [KaTeX]: https://github.com/Khan/KaTeX +[CommonMark]: http://commonmark.org diff --git a/benchmark/benchmark-pandoc.hs b/benchmark/benchmark-pandoc.hs index bf67eaa4d..2a34696b9 100644 --- a/benchmark/benchmark-pandoc.hs +++ b/benchmark/benchmark-pandoc.hs @@ -26,12 +26,18 @@ import Debug.Trace (trace) readerBench :: Pandoc -> (String, ReaderOptions -> String -> IO Pandoc) -> Maybe Benchmark -readerBench doc (name, reader) = case lookup name writers of - Just (PureStringWriter writer) -> - let inp = writer def{ writerWrapText = True} doc - in return $ bench (name ++ " reader") $ nfIO $ - (reader def{ readerSmart = True }) inp - _ -> trace ("\nCould not find writer for " ++ name ++ "\n") Nothing +readerBench doc (name, reader) = + case lookup name writers of + Just (PureStringWriter writer) -> + let inp = writer def{ writerWrapText = True} doc + in return $ bench (name ++ " reader") $ nfIO $ + (reader def{ readerSmart = True }) inp + _ | name == "commonmark" -> + let inp = writeMarkdown def{ writerWrapText = True} doc + in return $ bench (name ++ " reader") $ nfIO $ + (reader def{ readerSmart = True }) inp + | otherwise -> trace ("\nCould not find writer for " ++ name ++ + "\n") Nothing writerBench :: Pandoc -> (String, WriterOptions -> Pandoc -> String) diff --git a/pandoc.cabal b/pandoc.cabal index f4a8b4e69..823e92812 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -254,7 +254,8 @@ Library old-time, deepseq-generics >= 0.1 && < 0.2, JuicyPixels >= 3.1.6.1 && < 3.3, - filemanip >= 0.3 && < 0.4 + filemanip >= 0.3 && < 0.4, + cmark >= 0.3 && < 0.4 if flag(old-locale) Build-Depends: old-locale >= 1 && < 1.1, time >= 1.2 && < 1.5 @@ -292,6 +293,7 @@ Library Text.Pandoc.Readers.HTML, Text.Pandoc.Readers.LaTeX, Text.Pandoc.Readers.Markdown, + Text.Pandoc.Readers.CommonMark, Text.Pandoc.Readers.MediaWiki, Text.Pandoc.Readers.RST, Text.Pandoc.Readers.Org, diff 
--git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index d2bb85699..435e60eb1 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -66,6 +66,7 @@ module Text.Pandoc , mkStringReader , readDocx , readMarkdown + , readCommonMark , readMediaWiki , readRST , readOrg @@ -124,6 +125,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.JSON import Text.Pandoc.Readers.Markdown +import Text.Pandoc.Readers.CommonMark import Text.Pandoc.Readers.MediaWiki import Text.Pandoc.Readers.RST import Text.Pandoc.Readers.Org @@ -225,6 +227,7 @@ readers = [ ("native" , StringReader $ \_ s -> return $ readNative s) ,("markdown_phpextra" , mkStringReaderWithWarnings readMarkdownWithWarnings) ,("markdown_github" , mkStringReaderWithWarnings readMarkdownWithWarnings) ,("markdown_mmd", mkStringReaderWithWarnings readMarkdownWithWarnings) + ,("commonmark" , mkStringReader readCommonMark) ,("rst" , mkStringReaderWithWarnings readRSTWithWarnings ) ,("mediawiki" , mkStringReader readMediaWiki) ,("docbook" , mkStringReader readDocBook) diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs new file mode 100644 index 000000000..dfad7adc2 --- /dev/null +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -0,0 +1,118 @@ +{- +Copyright (C) 2015 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.CommonMark
+   Copyright   : Copyright (C) 2015 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane
+   Stability   : alpha
+   Portability : portable
+
+Conversion of CommonMark-formatted plain text to a 'Pandoc' document.
+
+CommonMark is a strongly specified variant of Markdown: http://commonmark.org.
+-}
+module Text.Pandoc.Readers.CommonMark (readCommonMark)
+where
+
+import CMark
+import Data.Text (unpack, pack)
+import Data.List (groupBy)
+import Text.Pandoc.Definition
+import Text.Pandoc.Options
+
+-- | Parse a CommonMark formatted string into a 'Pandoc' structure.
+--
+-- The input is always parsed with 'optNormalize'; 'optSmart' is added
+-- when 'readerSmart' is set in the supplied 'ReaderOptions'.
+readCommonMark :: ReaderOptions -> String -> Pandoc
+readCommonMark opts = nodeToPandoc . commonmarkToNode opts' . pack
+  where opts' = if readerSmart opts
+                   then [optNormalize, optSmart]
+                   else [optNormalize]
+
+-- | Convert the root of a cmark tree into a 'Pandoc' document with
+-- empty metadata ('nullMeta').
+nodeToPandoc :: Node -> Pandoc
+nodeToPandoc (Node _ DOCUMENT nodes) = Pandoc nullMeta $ addBlocks nodes
+nodeToPandoc n = Pandoc nullMeta $ addBlocks [n]  -- shouldn't happen
+
+-- | Convert a list of block-level nodes, preserving their order.
+addBlocks :: [Node] -> [Block]
+addBlocks = foldr addBlock []
+
+-- | Prepend the 'Block' for one node to an accumulated block list.
+--
+-- Nodes that have no case below are dropped (the accumulator is
+-- returned unchanged).
+addBlock :: Node -> [Block] -> [Block]
+addBlock (Node _ PARAGRAPH nodes) =
+  (Para (addInlines nodes) :)
+addBlock (Node _ HRULE _) =
+  (HorizontalRule :)
+addBlock (Node _ BLOCK_QUOTE nodes) =
+  (BlockQuote (addBlocks nodes) :)
+addBlock (Node _ (HTML t) _) =
+  (RawBlock (Format "html") (unpack t) :)
+addBlock (Node _ (CODE_BLOCK info t) _) =
+  -- Only the first word of the info string is kept, as the sole class.
+  (CodeBlock ("", take 1 (words (unpack info)), []) (unpack t) :)
+addBlock (Node _ (HEADER lev) nodes) =
+  (Header lev ("",[],[]) (addInlines nodes) :)
+addBlock (Node _ (LIST listAttrs) nodes) =
+  (constructor (map (setTightness . addBlocks . children) nodes) :)
+  where constructor = case listType listAttrs of
+                           BULLET_LIST  -> BulletList
+                           ORDERED_LIST -> OrderedList
+                                             (start, DefaultStyle, delim)
+        start = listStart listAttrs
+        -- In a tight list, top-level paragraphs of each item become 'Plain'.
+        setTightness = if listTight listAttrs
+                          then map paraToPlain
+                          else id
+        paraToPlain (Para xs) = Plain xs
+        paraToPlain x         = x
+        delim = case listDelim listAttrs of
+                     PERIOD_DELIM -> Period
+                     PAREN_DELIM  -> OneParen
+addBlock (Node _ ITEM _) = id -- handled in LIST
+addBlock _ = id
+
+-- | The child nodes of a node.
+children :: Node -> [Node]
+children (Node _ _ ns) = ns
+
+-- | Convert a list of inline-level nodes, preserving their order.
+addInlines :: [Node] -> [Inline]
+addInlines = foldr addInline []
+
+-- | Prepend the 'Inline's for one node to an accumulated inline list.
+--
+-- Nodes that have no case below are dropped (the accumulator is
+-- returned unchanged).
+addInline :: Node -> [Inline] -> [Inline]
+addInline (Node _ (TEXT t) _) = (map toinl clumps ++)
+  where raw = unpack t
+        -- Split into maximal runs of spaces and maximal runs of non-spaces.
+        clumps = groupBy samekind raw
+        samekind x y = (x == ' ') == (y == ' ')
+        -- A run of one or more spaces collapses to a single 'Space'.
+        toinl (' ':_) = Space
+        toinl xs      = Str xs
+addInline (Node _ LINEBREAK _) = (LineBreak :)
+addInline (Node _ SOFTBREAK _) = (Space :)
+addInline (Node _ (INLINE_HTML t) _) =
+  (RawInline (Format "html") (unpack t) :)
+addInline (Node _ (CODE t) _) =
+  (Code ("",[],[]) (unpack t) :)
+addInline (Node _ EMPH nodes) =
+  (Emph (addInlines nodes) :)
+addInline (Node _ STRONG nodes) =
+  (Strong (addInlines nodes) :)
+addInline (Node _ (LINK url title) nodes) =
+  (Link (addInlines nodes) (unpack url, unpack title) :)
+addInline (Node _ (IMAGE url title) nodes) =
+  (Image (addInlines nodes) (unpack url, unpack title) :)
+addInline _ = id
-- cgit v1.2.3