diff options
author | Anders Waldenborg <anders@0x63.nu> | 2018-06-29 10:41:26 +0200 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2018-06-29 10:41:26 +0200 |
commit | 904924d172d2fced32a96aa1d022d47a0fb59cd6 (patch) | |
tree | 6d331bb9ea93612b88071e0430a36e66b8fbf4c8 | |
parent | bb5a2464d54b76e97e42a015676042244bd09970 (diff) | |
download | pandoc-904924d172d2fced32a96aa1d022d47a0fb59cd6.tar.gz |
CommonMark reader: Handle ascii_identifiers extension (#4733)
Non-ascii characters were not stripped from identifiers even if the
`ascii_identifiers` extension was enabled (which it is by default for
gfm).
Closes #4742
-rw-r--r-- | src/Text/Pandoc/Readers/CommonMark.hs | 31 | ||||
-rw-r--r-- | test/command/4742.md | 25 |
2 files changed, 43 insertions, 13 deletions
diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index 79a4abbc2..a742ca666 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -39,7 +39,9 @@ import Control.Monad.State import Data.Char (isAlphaNum, isLetter, isSpace, toLower) import Data.List (groupBy) import qualified Data.Map as Map +import Data.Maybe (mapMaybe) import Data.Text (Text, unpack) +import Text.Pandoc.Asciify (toAsciiChar) import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Definition import Text.Pandoc.Emoji (emojis) @@ -51,7 +53,7 @@ import Text.Pandoc.Walk (walkM) readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readCommonMark opts s = return $ (if isEnabled Ext_gfm_auto_identifiers opts - then addHeaderIdentifiers + then addHeaderIdentifiers opts else id) $ nodeToPandoc opts $ commonmarkToNode opts' exts s where opts' = [ optSmart | isEnabled Ext_smart opts ] @@ -70,13 +72,13 @@ convertEmojis (':':xs) = convertEmojis (x:xs) = x : convertEmojis xs convertEmojis [] = [] -addHeaderIdentifiers :: Pandoc -> Pandoc -addHeaderIdentifiers doc = evalState (walkM addHeaderId doc) mempty +addHeaderIdentifiers :: ReaderOptions -> Pandoc -> Pandoc +addHeaderIdentifiers opts doc = evalState (walkM (addHeaderId opts) doc) mempty -addHeaderId :: Block -> State (Map.Map String Int) Block -addHeaderId (Header lev (_,classes,kvs) ils) = do +addHeaderId :: ReaderOptions -> Block -> State (Map.Map String Int) Block +addHeaderId opts (Header lev (_,classes,kvs) ils) = do idmap <- get - let ident = toIdent ils + let ident = toIdent opts ils ident' <- case Map.lookup ident idmap of Nothing -> do put (Map.insert ident 1 idmap) @@ -85,13 +87,16 @@ addHeaderId (Header lev (_,classes,kvs) ils) = do put (Map.adjust (+ 1) ident idmap) return (ident ++ "-" ++ show i) return $ Header lev (ident',classes,kvs) ils -addHeaderId x = return x - -toIdent :: [Inline] -> String -toIdent = map (\c -> if isSpace c then '-' 
else c) - . filter (\c -> isLetter c || isAlphaNum c || isSpace c || - c == '_' || c == '-') - . map toLower . stringify +addHeaderId _ x = return x + +toIdent :: ReaderOptions -> [Inline] -> String +toIdent opts = map (\c -> if isSpace c then '-' else c) + . filterer + . map toLower . stringify + where filterer = if isEnabled Ext_ascii_identifiers opts + then mapMaybe toAsciiChar + else filter (\c -> isLetter c || isAlphaNum c || isSpace c || + c == '_' || c == '-') nodeToPandoc :: ReaderOptions -> Node -> Pandoc nodeToPandoc opts (Node _ DOCUMENT nodes) = diff --git a/test/command/4742.md b/test/command/4742.md new file mode 100644 index 000000000..72751d727 --- /dev/null +++ b/test/command/4742.md @@ -0,0 +1,25 @@ +Check that the commonmark reader handles the `ascii_identifiers` +extension properly. + +``` +% pandoc -f commonmark+gfm_auto_identifiers+ascii_identifiers -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` + +``` +% pandoc -f commonmark+gfm_auto_identifiers-ascii_identifiers -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii-\65039-r\228ksm\246rg\229s",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` + +`gfm` should have `ascii_identifiers` enabled by default. + +``` +% pandoc -f gfm -t native +# non ascii ⚠️ räksmörgås +^D +[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] +``` |