From 904924d172d2fced32a96aa1d022d47a0fb59cd6 Mon Sep 17 00:00:00 2001 From: Anders Waldenborg Date: Fri, 29 Jun 2018 10:41:26 +0200 Subject: CommonMark reader: Handle ascii_identifiers extension (#4733) Non-ascii characters were not stripped from identifiers even if the `ascii_identifiers` extension was enabled (which is is by default for gfm). Closes #4742 --- src/Text/Pandoc/Readers/CommonMark.hs | 31 ++++++++++++++++++------------- test/command/4742.md | 25 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 13 deletions(-) create mode 100644 test/command/4742.md diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index 79a4abbc2..a742ca666 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -39,7 +39,9 @@ import Control.Monad.State import Data.Char (isAlphaNum, isLetter, isSpace, toLower) import Data.List (groupBy) import qualified Data.Map as Map +import Data.Maybe (mapMaybe) import Data.Text (Text, unpack) +import Text.Pandoc.Asciify (toAsciiChar) import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Emoji (emojis) @@ -51,7 +53,7 @@ import Text.Pandoc.Walk (walkM) readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readCommonMark opts s = return $ (if isEnabled Ext_gfm_auto_identifiers opts - then addHeaderIdentifiers + then addHeaderIdentifiers opts else id) $ nodeToPandoc opts $ commonmarkToNode opts' exts s where opts' = [ optSmart | isEnabled Ext_smart opts ] @@ -70,13 +72,13 @@ convertEmojis (':':xs) = convertEmojis (x:xs) = x : convertEmojis xs convertEmojis [] = [] -addHeaderIdentifiers :: Pandoc -> Pandoc -addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty +addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc +addHeaderIdentifiers opts doc = evalState (walkM (addHeaderId opts) doc) mempty -addHeaderId :: Block -> State (Map.Map String Int) Block -addHeaderId (Header lev (_,classes,kvs) ils) = do +addHeaderId :: ReaderOptions -> Block -> State (Map.Map String Int) Block +addHeaderId opts (Header lev (_,classes,kvs) ils) = do idmap <- get - let ident = toIdent ils + let ident = toIdent opts ils ident' <- case Map.lookup ident idmap of Nothing -> do put (Map.insert ident 1 idmap) @@ -85,13 +87,16 @@ addHeaderId (Header lev (_,classes,kvs) ils) = do put (Map.adjust (+ 1) ident idmap) return (ident ++ "-" ++ show i) return $ Header lev (ident',classes,kvs) ils -addHeaderId x = return x - -toIdent :: [Inline] -> String -toIdent = map (\c -> if isSpace c then '-' else c) - . filter (\c -> isLetter c || isAlphaNum c || isSpace c || - c == '_' || c == '-') - . map toLower . stringify +addHeaderId _ x = return x + +toIdent :: ReaderOptions -> [Inline] -> String +toIdent opts = map (\c -> if isSpace c then '-' else c) + . filterer + . map toLower . stringify + where filterer = if isEnabled Ext_ascii_identifiers opts + then mapMaybe toAsciiChar + else filter (\c -> isLetter c || isAlphaNum c || isSpace c || + c == '_' || c == '-') nodeToPandoc :: ReaderOptions -> Node -> Pandoc nodeToPandoc opts (Node _ DOCUMENT nodes) = diff --git a/test/command/4742.md b/test/command/4742.md new file mode 100644 index 000000000..72751d727 --- /dev/null +++ b/test/command/4742.md @@ -0,0 +1,25 @@ +Check that the commonmark reader handles the `ascii_identifiers` +extension properly. + +``` +% pandoc -f commonmark+gfm_auto_identifiers+ascii_identifiers -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` + +``` +% pandoc -f commonmark+gfm_auto_identifiers-ascii_identifiers -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii-\65039-r\228ksm\246rg\229s",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` + +`gfm` should have `ascii_identifiers` enabled by default. + +``` +% pandoc -f gfm -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` -- cgit v1.2.3