aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnders Waldenborg <anders@0x63.nu>2018-06-29 10:41:26 +0200
committerJohn MacFarlane <jgm@berkeley.edu>2018-06-29 10:41:26 +0200
commit904924d172d2fced32a96aa1d022d47a0fb59cd6 (patch)
tree6d331bb9ea93612b88071e0430a36e66b8fbf4c8
parentbb5a2464d54b76e97e42a015676042244bd09970 (diff)
downloadpandoc-904924d172d2fced32a96aa1d022d47a0fb59cd6.tar.gz
CommonMark reader: Handle ascii_identifiers extension (#4733)
Non-ascii characters were not stripped from identifiers even if the `ascii_identifiers` extension was enabled (which it is by default for gfm). Closes #4742.
-rw-r--r--src/Text/Pandoc/Readers/CommonMark.hs31
-rw-r--r--test/command/4742.md25
2 files changed, 43 insertions, 13 deletions
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs
index 79a4abbc2..a742ca666 100644
--- a/src/Text/Pandoc/Readers/CommonMark.hs
+++ b/src/Text/Pandoc/Readers/CommonMark.hs
@@ -39,7 +39,9 @@ import Control.Monad.State
import Data.Char (isAlphaNum, isLetter, isSpace, toLower)
import Data.List (groupBy)
import qualified Data.Map as Map
+import Data.Maybe (mapMaybe)
import Data.Text (Text, unpack)
+import Text.Pandoc.Asciify (toAsciiChar)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Emoji (emojis)
@@ -51,7 +53,7 @@ import Text.Pandoc.Walk (walkM)
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readCommonMark opts s = return $
(if isEnabled Ext_gfm_auto_identifiers opts
- then addHeaderIdentifiers
+ then addHeaderIdentifiers opts
else id) $
nodeToPandoc opts $ commonmarkToNode opts' exts s
where opts' = [ optSmart | isEnabled Ext_smart opts ]
@@ -70,13 +72,13 @@ convertEmojis (':':xs) =
convertEmojis (x:xs) = x : convertEmojis xs
convertEmojis [] = []
-addHeaderIdentifiers :: Pandoc -> Pandoc
-addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty
+addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc
+addHeaderIdentifiers opts doc = evalState (walkM (addHeaderId opts) doc) mempty
-addHeaderId :: Block -> State (Map.Map String Int) Block
-addHeaderId (Header lev (_,classes,kvs) ils) = do
+addHeaderId :: ReaderOptions -> Block -> State (Map.Map String Int) Block
+addHeaderId opts (Header lev (_,classes,kvs) ils) = do
idmap <- get
- let ident = toIdent ils
+ let ident = toIdent opts ils
ident' <- case Map.lookup ident idmap of
Nothing -> do
put (Map.insert ident 1 idmap)
@@ -85,13 +87,16 @@ addHeaderId (Header lev (_,classes,kvs) ils) = do
put (Map.adjust (+ 1) ident idmap)
return (ident ++ "-" ++ show i)
return $ Header lev (ident',classes,kvs) ils
-addHeaderId x = return x
-
-toIdent :: [Inline] -> String
-toIdent = map (\c -> if isSpace c then '-' else c)
- . filter (\c -> isLetter c || isAlphaNum c || isSpace c ||
- c == '_' || c == '-')
- . map toLower . stringify
+addHeaderId _ x = return x
+
+toIdent :: ReaderOptions -> [Inline] -> String
+toIdent opts = map (\c -> if isSpace c then '-' else c)
+ . filterer
+ . map toLower . stringify
+ where filterer = if isEnabled Ext_ascii_identifiers opts
+ then mapMaybe toAsciiChar
+ else filter (\c -> isLetter c || isAlphaNum c || isSpace c ||
+ c == '_' || c == '-')
nodeToPandoc :: ReaderOptions -> Node -> Pandoc
nodeToPandoc opts (Node _ DOCUMENT nodes) =
diff --git a/test/command/4742.md b/test/command/4742.md
new file mode 100644
index 000000000..72751d727
--- /dev/null
+++ b/test/command/4742.md
@@ -0,0 +1,25 @@
+Check that the commonmark reader handles the `ascii_identifiers`
+extension properly.
+
+```
+% pandoc -f commonmark+gfm_auto_identifiers+ascii_identifiers -t native
+# non ascii ⚠️ räksmörgås
+^D
+[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
+```
+
+```
+% pandoc -f commonmark+gfm_auto_identifiers-ascii_identifiers -t native
+# non ascii ⚠️ räksmörgås
+^D
+[Header 1 ("non-ascii-\65039-r\228ksm\246rg\229s",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
+```
+
+`gfm` should have `ascii_identifiers` enabled by default.
+
+```
+% pandoc -f gfm -t native
+# non ascii ⚠️ räksmörgås
+^D
+[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]]
+```