From c5c4e19c99003e64665d244bb40ffa0eff93d269 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 28 Mar 2010 22:29:31 -0700 Subject: Shared: Fixed uniqueIdent so it behaves as described in README. Previously some characters that are illegal in HTML identifiers, such as '<', were being allowed in header identifiers. The logic has now been fixed. Thanks to Xyne for reporting. --- src/Text/Pandoc/Shared.hs | 67 ++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 54d3f9a43..26aff4250 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -117,7 +117,7 @@ import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, import qualified Text.PrettyPrint.HughesPJ as PP import Text.Pandoc.CharacterReferences ( characterReference ) import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isAscii, - isPunctuation ) + isLetter, isDigit ) import Data.List ( find, isPrefixOf, intercalate ) import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString, unEscapeString ) import Codec.Binary.UTF8.String ( encodeString, decodeString ) @@ -914,38 +914,37 @@ data Element = Blk Block -- lvl num ident label contents deriving (Eq, Read, Show, Typeable, Data) --- | Convert Pandoc inline list to plain text identifier. +-- | Convert Pandoc inline list to plain text identifier. HTML +-- identifiers must start with a letter, and may contain only +-- letters, digits, and the characters _-:. inlineListToIdentifier :: [Inline] -> String -inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier' - -inlineListToIdentifier' :: [Inline] -> [Char] -inlineListToIdentifier' [] = "" -inlineListToIdentifier' (x:xs) = - xAsText ++ inlineListToIdentifier' xs - where xAsText = case x of - Str s -> filter (\c -> c `elem` "_-." || not (isPunctuation c)) $ - intercalate "-" $ words $ map toLower s - Emph lst -> inlineListToIdentifier' lst - Strikeout lst -> inlineListToIdentifier' lst - Superscript lst -> inlineListToIdentifier' lst - SmallCaps lst -> inlineListToIdentifier' lst - Subscript lst -> inlineListToIdentifier' lst - Strong lst -> inlineListToIdentifier' lst - Quoted _ lst -> inlineListToIdentifier' lst - Cite _ lst -> inlineListToIdentifier' lst - Code s -> s - Space -> "-" - EmDash -> "-" - EnDash -> "-" - Apostrophe -> "" - Ellipses -> "" - LineBreak -> "-" - Math _ _ -> "" - TeX _ -> "" - HtmlInline _ -> "" - Link lst _ -> inlineListToIdentifier' lst - Image lst _ -> inlineListToIdentifier' lst - Note _ -> "" +inlineListToIdentifier = + dropWhile (not . isAlpha) . intercalate "-" . words . map toLower . + filter (\c -> isLetter c || isDigit c || c `elem` "_-:. ") . + concatMap extractText + where extractText x = case x of + Str s -> s + Emph lst -> concatMap extractText lst + Strikeout lst -> concatMap extractText lst + Superscript lst -> concatMap extractText lst + SmallCaps lst -> concatMap extractText lst + Subscript lst -> concatMap extractText lst + Strong lst -> concatMap extractText lst + Quoted _ lst -> concatMap extractText lst + Cite _ lst -> concatMap extractText lst + Code s -> s + Space -> " " + EmDash -> "---" + EnDash -> "--" + Apostrophe -> "" + Ellipses -> "..." + LineBreak -> " " + Math _ s -> s + TeX _ -> "" + HtmlInline _ -> "" + Link lst _ -> concatMap extractText lst + Image lst _ -> concatMap extractText lst + Note _ -> "" -- | Convert list of Pandoc blocks into (hierarchical) list of Elements hierarchicalize :: [Block] -> [Element] @@ -977,7 +976,9 @@ headerLtEq _ _ = False -- Second argument is a list of already used identifiers. uniqueIdent :: [Inline] -> [String] -> String uniqueIdent title' usedIdents = - let baseIdent = inlineListToIdentifier title' + let baseIdent = case inlineListToIdentifier title' of + "" -> "section" + x -> x numIdent n = baseIdent ++ "-" ++ show n in if baseIdent `elem` usedIdents then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of -- cgit v1.2.3