From 5bc38a741bdd5a0470b92c4cd62769bb8dd4ddf2 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 11 Nov 2018 20:45:38 -0800 Subject: Exactly match GitHub's identifier generating algorithm. See #5057. --- src/Text/Pandoc/Shared.hs | 9 +++++++-- test/command/4742.md | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 0b29347a3..0bb9b20d7 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -116,7 +116,9 @@ import Control.Monad (MonadPlus (..), msum, unless) import qualified Control.Monad.State.Strict as S import qualified Data.ByteString.Lazy as BL import qualified Data.Bifunctor as Bifunctor -import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum) +import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum, + generalCategory, GeneralCategory(NonSpacingMark, + SpacingCombiningMark, EnclosingMark, ConnectorPunctuation)) import Data.Data (Data, Typeable) import Data.List (find, intercalate, intersperse, stripPrefix) import qualified Data.Map as M @@ -504,7 +506,10 @@ inlineListToIdentifier exts = | otherwise = intercalate "-" . words . filterPunct . map toLower filterPunct = filter (\c -> isSpace c || isAlphaNum c || isAllowedPunct c) isAllowedPunct c - | extensionEnabled Ext_gfm_auto_identifiers exts = c == '_' || c == '-' + | extensionEnabled Ext_gfm_auto_identifiers exts + = c == '-' || c == '_' || + generalCategory c `elem` [NonSpacingMark, SpacingCombiningMark, + EnclosingMark, ConnectorPunctuation] | otherwise = c == '_' || c == '-' || c == '.' spaceToDash = map (\c -> if isSpace c then '-' else c) diff --git a/test/command/4742.md b/test/command/4742.md index f97314b10..600880e90 100644 --- a/test/command/4742.md +++ b/test/command/4742.md @@ -8,6 +8,10 @@ extension properly. 
[Header 1 ("non-ascii--raksmorgas",[],[]) [Str "non",Space,Str "ascii",Space,Str "\9888\65039",Space,Str "r\228ksm\246rg\229s"]] ``` +Note that the emoji here is actually a composite character, +formed from \9888 and \65039. The latter is a nonspacing combining mark +(Unicode category Mn), so it survives in the identifier; \9888 is dropped. + ``` % pandoc -f commonmark+auto_identifiers+gfm_auto_identifiers-ascii_identifiers -t native # non ascii ⚠️ räksmörgås -- cgit v1.2.3