aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2020-03-31 23:23:27 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2020-07-19 22:51:59 -0700
commit0db4702042344e3b670a53480bb0ab5b41eb9aeb (patch)
tree18330e66d27606b276a12d16cfc82136f61879a2 /src
parentc2cc4a050476ad2680ab23be5da4a0b1b12788cc (diff)
downloadpandoc-0db4702042344e3b670a53480bb0ab5b41eb9aeb.tar.gz
Use commonmark-hs to parse commonmark/gfm...
...instead of cmark-gfm (a wrapper around a C library). We can now support many more pandoc extensions for commonmark and gfm. Add fenced_code_attributes to gfm/commonmark extensions.
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Extensions.hs14
-rw-r--r--src/Text/Pandoc/Readers/CommonMark.hs241
2 files changed, 57 insertions, 198 deletions
diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs
index e33a59e37..6594b81a1 100644
--- a/src/Text/Pandoc/Extensions.hs
+++ b/src/Text/Pandoc/Extensions.hs
@@ -438,6 +438,20 @@ getAllExtensions f = universalExtensions <> getAll f
, Ext_implicit_figures
, Ext_hard_line_breaks
, Ext_smart
+ , Ext_tex_math_dollars
+ , Ext_superscript
+ , Ext_subscript
+ , Ext_definition_lists
+ , Ext_footnotes
+ , Ext_fancy_lists
+ , Ext_fenced_divs
+ , Ext_bracketed_spans
+ , Ext_raw_attribute
+ , Ext_inline_code_attributes
+ , Ext_fenced_code_attributes
+ , Ext_link_attributes
+ , Ext_header_attributes
+ , Ext_implicit_header_references
]
getAll "commonmark" = getAll "gfm"
getAll "org" = autoIdExtensions <>
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs
index c63ea6392..d2bde1d2a 100644
--- a/src/Text/Pandoc/Readers/CommonMark.hs
+++ b/src/Text/Pandoc/Readers/CommonMark.hs
@@ -1,4 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
{- |
Module : Text.Pandoc.Readers.CommonMark
@@ -16,209 +17,53 @@ CommonMark is a strongly specified variant of Markdown: http://commonmark.org.
module Text.Pandoc.Readers.CommonMark (readCommonMark)
where
-import CMarkGFM
-import Control.Monad.State
-import qualified Data.Set as Set
+import Commonmark
+import Commonmark.Extensions
+import Commonmark.Pandoc
import Data.Text (Text)
-import qualified Data.Text as T
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Definition
-import Text.Pandoc.Emoji (emojiToInline)
+import Text.Pandoc.Builder as B
import Text.Pandoc.Options
-import Text.Pandoc.Shared (uniqueIdent, taskListItemFromAscii)
-import Text.Pandoc.Walk (walkM)
+import Text.Pandoc.Error
+import Control.Monad.Except
+import Data.Functor.Identity (runIdentity)
-- | Parse a CommonMark formatted string into a 'Pandoc' structure.
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
-readCommonMark opts s = return $
- (if isEnabled Ext_auto_identifiers opts
- then addHeaderIdentifiers opts
- else id) $
- nodeToPandoc opts $ commonmarkToNode opts' exts s
- where opts' = [ optSmart | isEnabled Ext_smart opts ]
- exts = [ extStrikethrough | isEnabled Ext_strikeout opts ] ++
- [ extTable | isEnabled Ext_pipe_tables opts ] ++
- [ extAutolink | isEnabled Ext_autolink_bare_uris opts ]
+readCommonMark opts s = do
+ let res = runIdentity $
+ commonmarkWith (foldr (<>) defaultSyntaxSpec exts) "input" s
+ case res of
+ Left err -> throwError $ PandocParsecError s err
+ Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls
+ where
+ exts = [ hardLineBreaksSpec | isEnabled Ext_hard_line_breaks opts ] ++
+ [ smartPunctuationSpec | isEnabled Ext_smart opts ] ++
+ [ strikethroughSpec | isEnabled Ext_strikeout opts ] ++
+ [ superscriptSpec | isEnabled Ext_superscript opts ] ++
+ [ subscriptSpec | isEnabled Ext_subscript opts ] ++
+ [ mathSpec | isEnabled Ext_tex_math_dollars opts ] ++
+ [ fancyListSpec | isEnabled Ext_fancy_lists opts ] ++
+ [ fencedDivSpec | isEnabled Ext_fenced_divs opts ] ++
+ [ bracketedSpanSpec | isEnabled Ext_bracketed_spans opts ] ++
+ [ rawAttributeSpec | isEnabled Ext_raw_attribute opts ] ++
+ [ attributesSpec | isEnabled Ext_link_attributes opts ||
+ isEnabled Ext_inline_code_attributes opts ||
+ isEnabled Ext_fenced_code_attributes opts ||
+ isEnabled Ext_header_attributes opts ] ++
+ [ pipeTableSpec | isEnabled Ext_pipe_tables opts ] ++
+ [ autolinkSpec | isEnabled Ext_autolink_bare_uris opts ] ++
+ [ emojiSpec | isEnabled Ext_emoji opts ] ++
+ [ autoIdentifiersSpec
+ | isEnabled Ext_gfm_auto_identifiers opts
+ , not (isEnabled Ext_ascii_identifiers opts) ] ++
+ [ autoIdentifiersAsciiSpec
+ | isEnabled Ext_gfm_auto_identifiers opts
+ , isEnabled Ext_ascii_identifiers opts ] ++
+ [ implicitHeadingReferencesSpec
+ | isEnabled Ext_implicit_header_references opts ] ++
+ [ footnoteSpec | isEnabled Ext_footnotes opts ] ++
+ [ definitionListSpec | isEnabled Ext_definition_lists opts ] ++
+ [ taskListSpec | isEnabled Ext_task_lists opts ]
-convertEmojis :: Text -> [Inline]
-convertEmojis s@(T.uncons -> Just (':',xs)) =
- case T.break (==':') xs of
- (ys, T.uncons -> Just (':',zs)) ->
- case emojiToInline ys of
- Just em -> em : convertEmojis zs
- Nothing -> Str (":" <> ys) : convertEmojis (":" <> zs)
- _ -> [Str s]
-convertEmojis s =
- case T.break (==':') s of
- ("","") -> []
- (_,"") -> [Str s]
- (xs,ys) -> Str xs : convertEmojis ys
-
-addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc
-addHeaderIdentifiers opts doc = evalState (walkM (addHeaderId opts) doc) mempty
-
-addHeaderId :: ReaderOptions -> Block -> State (Set.Set Text) Block
-addHeaderId opts (Header lev (_,classes,kvs) ils) = do
- ids <- get
- let ident = uniqueIdent (readerExtensions opts) ils ids
- modify (Set.insert ident)
- return $ Header lev (ident,classes,kvs) ils
-addHeaderId _ x = return x
-
-nodeToPandoc :: ReaderOptions -> Node -> Pandoc
-nodeToPandoc opts (Node _ DOCUMENT nodes) =
- Pandoc nullMeta $ foldr (addBlock opts) [] nodes
-nodeToPandoc opts n = -- shouldn't happen
- Pandoc nullMeta $ foldr (addBlock opts) [] [n]
-
-addBlocks :: ReaderOptions -> [Node] -> [Block]
-addBlocks opts = foldr (addBlock opts) []
-
-addBlock :: ReaderOptions -> Node -> [Block] -> [Block]
-addBlock opts (Node _ PARAGRAPH nodes) =
- case addInlines opts nodes of
- [Image attr alt (src,tit)]
- | isEnabled Ext_implicit_figures opts
- -- the "fig:" prefix indicates an implicit figure
- -> (Para [Image attr alt (src, "fig:" <> tit)] :)
- ils -> (Para ils :)
-addBlock _ (Node _ THEMATIC_BREAK _) =
- (HorizontalRule :)
-addBlock opts (Node _ BLOCK_QUOTE nodes) =
- (BlockQuote (addBlocks opts nodes) :)
-addBlock opts (Node _ (HTML_BLOCK t) _)
- | isEnabled Ext_raw_html opts = (RawBlock (Format "html") t :)
- | otherwise = id
--- Note: the cmark parser will never generate CUSTOM_BLOCK,
--- so we don't need to handle it:
-addBlock _ (Node _ (CUSTOM_BLOCK _onEnter _onExit) _nodes) =
- id
-addBlock _ (Node _ (CODE_BLOCK info t) _) =
- (CodeBlock ("", take 1 (T.words info), []) t :)
-addBlock opts (Node _ (HEADING lev) nodes) =
- (Header lev ("",[],[]) (addInlines opts nodes) :)
-addBlock opts (Node _ (LIST listAttrs) nodes) =
- (constructor (map listItem nodes) :)
- where constructor = case listType listAttrs of
- BULLET_LIST -> BulletList
- ORDERED_LIST -> OrderedList
- (start, DefaultStyle, delim)
- start = listStart listAttrs
- listItem = taskListItemFromAscii exts . setTightness
- . addBlocks opts . children
- setTightness = if listTight listAttrs
- then map paraToPlain
- else id
- paraToPlain (Para xs) = Plain xs
- paraToPlain x = x
- delim = case listDelim listAttrs of
- PERIOD_DELIM -> Period
- PAREN_DELIM -> OneParen
- exts = readerExtensions opts
-addBlock opts (Node _ (TABLE alignments) nodes) =
- (Table
- nullAttr
- (Caption Nothing [])
- (zip aligns widths)
- (TableHead nullAttr headers)
- [TableBody nullAttr 0 [] rows]
- (TableFoot nullAttr []) :)
- where aligns = map fromTableCellAlignment alignments
- fromTableCellAlignment NoAlignment = AlignDefault
- fromTableCellAlignment LeftAligned = AlignLeft
- fromTableCellAlignment RightAligned = AlignRight
- fromTableCellAlignment CenterAligned = AlignCenter
- widths = replicate numcols ColWidthDefault
- numcols = if null rows'
- then 0
- else maximum $ map rowLength rows'
- rows' = map toRow $ filter isRow nodes
- (headers, rows) = case rows' of
- (h:rs) -> ([h], rs)
- [] -> ([], [])
- isRow (Node _ TABLE_ROW _) = True
- isRow _ = False
- isCell (Node _ TABLE_CELL _) = True
- isCell _ = False
- toRow (Node _ TABLE_ROW ns) = Row nullAttr $ map toCell $ filter isCell ns
- toRow (Node _ t _) = error $ "toRow encountered non-row " ++ show t
- toCell (Node _ TABLE_CELL []) = fromSimpleCell []
- toCell (Node _ TABLE_CELL (n:ns))
- | isBlockNode n = fromSimpleCell $ addBlocks opts (n:ns)
- | otherwise = fromSimpleCell [Plain (addInlines opts (n:ns))]
- toCell (Node _ t _) = error $ "toCell encountered non-cell " ++ show t
- fromSimpleCell = Cell nullAttr AlignDefault 1 1
- rowLength (Row _ body) = length body -- all cells are 1×1
-addBlock _ (Node _ TABLE_ROW _) = id -- handled in TABLE
-addBlock _ (Node _ TABLE_CELL _) = id -- handled in TABLE
-addBlock _ _ = id
-
-isBlockNode :: Node -> Bool
-isBlockNode (Node _ nodetype _) =
- case nodetype of
- DOCUMENT -> True
- THEMATIC_BREAK -> True
- PARAGRAPH -> True
- BLOCK_QUOTE -> True
- HTML_BLOCK _ -> True
- CUSTOM_BLOCK _ _ -> True
- CODE_BLOCK _ _ -> True
- HEADING _ -> True
- LIST _ -> True
- ITEM -> True
- TEXT _ -> False
- SOFTBREAK -> False
- LINEBREAK -> False
- HTML_INLINE _ -> False
- CUSTOM_INLINE _ _ -> False
- CODE _ -> False
- EMPH -> False
- STRONG -> False
- LINK _ _ -> False
- IMAGE _ _ -> False
- STRIKETHROUGH -> False
- TABLE _ -> False
- TABLE_ROW -> False
- TABLE_CELL -> False
-
-children :: Node -> [Node]
-children (Node _ _ ns) = ns
-
-addInlines :: ReaderOptions -> [Node] -> [Inline]
-addInlines opts = foldr (addInline opts) []
-
-addInline :: ReaderOptions -> Node -> [Inline] -> [Inline]
-addInline opts (Node _ (TEXT t) _) = (concatMap toinl clumps ++)
- where clumps = T.groupBy samekind t
- samekind ' ' ' ' = True
- samekind ' ' _ = False
- samekind _ ' ' = False
- samekind _ _ = True
- toinl (T.uncons -> Just (' ', _)) = [Space]
- toinl xs = if isEnabled Ext_emoji opts
- then convertEmojis xs
- else [Str xs]
-addInline _ (Node _ LINEBREAK _) = (LineBreak :)
-addInline opts (Node _ SOFTBREAK _)
- | isEnabled Ext_hard_line_breaks opts = (LineBreak :)
- | otherwise = (SoftBreak :)
-addInline opts (Node _ (HTML_INLINE t) _)
- | isEnabled Ext_raw_html opts = (RawInline (Format "html") t :)
- | otherwise = id
--- Note: the cmark parser will never generate CUSTOM_BLOCK,
--- so we don't need to handle it:
-addInline _ (Node _ (CUSTOM_INLINE _onEnter _onExit) _nodes) =
- id
-addInline _ (Node _ (CODE t) _) =
- (Code ("",[],[]) t :)
-addInline opts (Node _ EMPH nodes) =
- (Emph (addInlines opts nodes) :)
-addInline opts (Node _ STRONG nodes) =
- (Strong (addInlines opts nodes) :)
-addInline opts (Node _ STRIKETHROUGH nodes) =
- (Strikeout (addInlines opts nodes) :)
-addInline opts (Node _ (LINK url title) nodes) =
- (Link nullAttr (addInlines opts nodes) (url, title) :)
-addInline opts (Node _ (IMAGE url title) nodes) =
- (Image nullAttr (addInlines opts nodes) (url, title) :)
-addInline _ _ = id