From e37cf4484d38c171d9f7477a8ae9eca9643cc426 Mon Sep 17 00:00:00 2001
From: OCzarnecki <44535552+OCzarnecki@users.noreply.github.com>
Date: Mon, 16 Aug 2021 06:57:57 +0200
Subject: Multimarkdown sub- and superscripts (#5512) (#7188)

Added an extension `short_subsuperscripts` which modifies the behavior
of `subscript` and `superscript`, allowing subscripts or superscripts containing only
alphanumerics to end with a space character (eg. `x^2 = 4` or `H~2 is
combustible`).  This improves support for multimarkdown. Closes #5512.

Add `Ext_short_subsuperscripts` constructor to `Extension` [API change].
This is enabled by default for `markdown_mmd`.
---
 MANUAL.txt                          | 12 ++++++++++
 src/Text/Pandoc/Extensions.hs       | 11 ++++-----
 src/Text/Pandoc/Readers/Markdown.hs | 24 ++++++++++++-------
 test/Tests/Readers/Markdown.hs      | 48 +++++++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 15 deletions(-)
diff --git a/MANUAL.txt b/MANUAL.txt
index a6edd8ccd..bed3b2009 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -5304,6 +5304,18 @@ For elements that accept attributes, a `data-pos` attribute
 is added; other elements are placed in a surrounding
 Div or Span elemnet with a `data-pos` attribute.
 
+#### Extension: `short_subsuperscript` ####
+
+Parse multimarkdown style subscripts and superscripts, which start with
+a '~' or '^' character, respectively, and include the alphanumeric sequence
+that follows. For example:
+
+    x^2 = 4
+
+or
+
+    Oxygen is O~2.
+
 ## Markdown variants
 
 In addition to pandoc's extended Markdown, the following Markdown
diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs
index ce04ce641..2ef8f64e9 100644
--- a/src/Text/Pandoc/Extensions.hs
+++ b/src/Text/Pandoc/Extensions.hs
@@ -124,6 +124,7 @@ data Extension =
     | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid]
     | Ext_mmd_link_attributes     -- ^ MMD style reference link attributes
     | Ext_mmd_title_block     -- ^ Multimarkdown metadata block
+    | Ext_short_subsuperscripts -- ^ sub-&superscripts w/o closing char (v~i)
     | Ext_multiline_tables    -- ^ Pandoc-style multiline tables
     | Ext_native_divs             -- ^ Use Div blocks for contents of <div> tags
     | Ext_native_spans            -- ^ Use Span inlines for contents of <span>
@@ -286,14 +287,9 @@ multimarkdownExtensions = extensionsFromList
   , Ext_auto_identifiers
   , Ext_mmd_header_identifiers
   , Ext_implicit_figures
-  -- Note: MMD's syntax for superscripts and subscripts
-  -- is a bit more permissive than pandoc's, allowing
-  -- e^2 and a~1 instead of e^2^ and a~1~, so even with
-  -- these options we don't have full support for MMD
-  -- superscripts and subscripts, but there's no reason
-  -- not to include these:
-  , Ext_superscript
+  , Ext_short_subsuperscripts
   , Ext_subscript
+  , Ext_superscript
   , Ext_backtick_code_blocks
   , Ext_spaced_reference_links
   -- So far only in dev version of mmd:
@@ -464,6 +460,7 @@ getAllExtensions f = universalExtensions <> getAll f
        , Ext_gutenberg
        , Ext_smart
        , Ext_literate_haskell
+       , Ext_short_subsuperscripts
        , Ext_rebase_relative_paths
        ]
   getAll "markdown_strict"   = allMarkdownExtensions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 2dc7ddf52..536e502cf 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -1692,21 +1692,29 @@ strikeout = fmap B.strikeout <$>
 
 superscript :: PandocMonad m => MarkdownParser m (F Inlines)
 superscript = do
-  guardEnabled Ext_superscript
   fmap B.superscript <$> try (do
     char '^'
-    mconcat <$> many1Till (do notFollowedBy spaceChar
-                              notFollowedBy newline
-                              inline) (char '^'))
+    mconcat <$> (try regularSuperscript <|> try mmdShortSuperscript))
+      where regularSuperscript = many1Till (do guardEnabled Ext_superscript
+                                               notFollowedBy spaceChar
+                                               notFollowedBy newline
+                                               inline) (char '^')
+            mmdShortSuperscript = do guardEnabled Ext_short_subsuperscripts
+                                     result <- take1WhileP isAlphaNum
+                                     return $ return $ return $ B.str result
 
 subscript :: PandocMonad m => MarkdownParser m (F Inlines)
 subscript = do
-  guardEnabled Ext_subscript
   fmap B.subscript <$> try (do
     char '~'
-    mconcat <$> many1Till (do notFollowedBy spaceChar
-                              notFollowedBy newline
-                              inline) (char '~'))
+    mconcat <$> (try regularSubscript <|> mmdShortSubscript))
+      where regularSubscript = many1Till (do guardEnabled Ext_subscript
+                                             notFollowedBy spaceChar
+                                             notFollowedBy newline
+                                             inline) (char '~')
+            mmdShortSubscript = do guardEnabled Ext_short_subsuperscripts
+                                   result <- take1WhileP isAlphaNum
+                                   return $ return $ return $ B.str result
 
 whitespace :: PandocMonad m => MarkdownParser m (F Inlines)
 whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace"
diff --git a/test/Tests/Readers/Markdown.hs b/test/Tests/Readers/Markdown.hs
index f055ab197..02fc0d8ce 100644
--- a/test/Tests/Readers/Markdown.hs
+++ b/test/Tests/Readers/Markdown.hs
@@ -36,6 +36,9 @@ markdownGH :: Text -> Pandoc
 markdownGH = purely $ readMarkdown def {
                 readerExtensions = githubMarkdownExtensions }
 
+markdownMMD :: Text -> Pandoc
+markdownMMD = purely $ readMarkdown def {
+                 readerExtensions = multimarkdownExtensions }
 infix 4 =:
 (=:) :: ToString c
      => String -> (Text, c) -> TestTree
@@ -360,6 +363,51 @@ tests = [ testGroup "inline code"
             ("**this should \"be bold**"
             =?> para (strong "this should \8220be bold"))
           ]
+        , testGroup "sub- and superscripts"
+          [
+            test markdownMMD "normal subscript"
+            ("H~2~"
+            =?> para ("H" <> subscript "2"))
+          , test markdownMMD "normal superscript"
+            ("x^3^"
+            =?> para ("x" <> superscript "3"))
+          , test markdownMMD "short subscript delimeted by space"
+            ("O~2 is dangerous"
+            =?> para ("O" <> subscript "2" <> space <> "is dangerous"))
+          , test markdownMMD "short subscript delimeted by newline"
+            ("O~2\n"
+            =?> para ("O" <> subscript "2"))
+          , test markdownMMD "short subscript delimeted by EOF"
+            ("O~2"
+            =?> para ("O" <> subscript "2"))
+          , test markdownMMD "short subscript delimited by punctuation"
+            ("O~2."
+            =?> para ("O" <> subscript "2" <> "."))
+          , test markdownMMD "short subscript delimited by emph"
+            ("O~2*combustible!*"
+            =?> para ("O" <> subscript "2" <> emph "combustible!"))
+          , test markdownMMD "no nesting in short subscripts"
+            ("y~*2*"
+            =?> para ("y~" <> emph "2"))
+          , test markdownMMD "short superscript delimeted by space"
+            ("x^2 = y"
+            =?> para ("x" <> superscript "2" <> space <> "= y"))
+          , test markdownMMD "short superscript delimeted by newline"
+            ("x^2\n"
+            =?> para ("x" <> superscript "2"))
+          , test markdownMMD "short superscript delimeted by ExF"
+            ("x^2"
+            =?> para ("x" <> superscript "2"))
+          , test markdownMMD "short superscript delimited by punctuation"
+            ("x^2."
+            =?> para ("x" <> superscript "2" <> "."))
+          , test markdownMMD "short superscript delimited by emph"
+            ("x^2*combustible!*"
+            =?> para ("x" <> superscript "2" <> emph "combustible!"))
+          , test markdownMMD "no nesting in short superscripts"
+            ("y^*2*"
+            =?> para ("y^" <> emph "2"))
+          ]
         , testGroup "footnotes"
           [ "indent followed by newline and flush-left text" =:
             "[^1]\n\n[^1]: my note\n\n     \nnot in note\n"
-- 
cgit v1.2.3