Shared: Fixed uniqueIdent so it behaves as described in README.

Previously some characters that are illegal in HTML identifiers, such as '<', were being allowed in header identifiers. The logic has now been fixed. Thanks to Xyne for reporting.
author: John MacFarlane <jgm@berkeley.edu> 2010-03-28 22:29:31 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2010-03-28 22:29:31 -0700
commit: c5c4e19c99003e64665d244bb40ffa0eff93d269 (patch)
tree: a6c4644667a1af55caa2ceb3ac65cfdbbafb3476
parent: be832b3676f01873e38bfd380a1d03f3c06c6d1e (diff)
download: pandoc-c5c4e19c99003e64665d244bb40ffa0eff93d269.tar.gz
1 files changed, 34 insertions, 33 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 54d3f9a43..26aff4250 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -117,7 +117,7 @@ import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text,
 import qualified Text.PrettyPrint.HughesPJ as PP
 import Text.Pandoc.CharacterReferences ( characterReference )
 import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isAscii,
-                   isPunctuation )
+                   isLetter, isDigit )
 import Data.List ( find, isPrefixOf, intercalate )
 import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString, unEscapeString )
 import Codec.Binary.UTF8.String ( encodeString, decodeString )
@@ -914,38 +914,37 @@ data Element = Blk Block
              --    lvl  num ident  label    contents
              deriving (Eq, Read, Show, Typeable, Data)
 
--- | Convert Pandoc inline list to plain text identifier.
+-- | Convert Pandoc inline list to plain text identifier.  HTML
+-- identifiers must start with a letter, and may contain only
+-- letters, digits, and the characters _-:.
 inlineListToIdentifier :: [Inline] -> String
-inlineListToIdentifier = dropWhile (not . isAlpha) . inlineListToIdentifier'
-
-inlineListToIdentifier' :: [Inline] -> [Char]
-inlineListToIdentifier' [] = ""
-inlineListToIdentifier' (x:xs) =
-  xAsText ++ inlineListToIdentifier' xs
-  where xAsText = case x of
-          Str s          -> filter (\c -> c `elem` "_-." || not (isPunctuation c)) $
-                            intercalate "-" $ words $ map toLower s
-          Emph lst       -> inlineListToIdentifier' lst
-          Strikeout lst  -> inlineListToIdentifier' lst
-          Superscript lst -> inlineListToIdentifier' lst
-          SmallCaps   lst -> inlineListToIdentifier' lst
-          Subscript lst  -> inlineListToIdentifier' lst
-          Strong lst     -> inlineListToIdentifier' lst
-          Quoted _ lst   -> inlineListToIdentifier' lst
-          Cite   _ lst   -> inlineListToIdentifier' lst
-          Code s         -> s
-          Space          -> "-"
-          EmDash         -> "-"
-          EnDash         -> "-"
-          Apostrophe     -> ""
-          Ellipses       -> ""
-          LineBreak      -> "-"
-          Math _ _       -> ""
-          TeX _          -> ""
-          HtmlInline _   -> ""
-          Link lst _     -> inlineListToIdentifier' lst
-          Image lst _    -> inlineListToIdentifier' lst
-          Note _         -> ""
+inlineListToIdentifier =
+  dropWhile (not . isAlpha) . intercalate "-" . words . map toLower .
+  filter (\c -> isLetter c || isDigit c || c `elem` "_-:. ") .
+  concatMap extractText
+    where extractText x = case x of
+              Str s           -> s
+              Emph lst        -> concatMap extractText lst
+              Strikeout lst   -> concatMap extractText lst
+              Superscript lst -> concatMap extractText lst
+              SmallCaps   lst -> concatMap extractText lst
+              Subscript lst   -> concatMap extractText lst
+              Strong lst      -> concatMap extractText lst
+              Quoted _ lst    -> concatMap extractText lst
+              Cite   _ lst    -> concatMap extractText lst
+              Code s          -> s
+              Space           -> " "
+              EmDash          -> "---"
+              EnDash          -> "--"
+              Apostrophe      -> ""
+              Ellipses        -> "..."
+              LineBreak       -> " "
+              Math _ s        -> s
+              TeX _           -> ""
+              HtmlInline _    -> ""
+              Link lst _      -> concatMap extractText lst
+              Image lst _     -> concatMap extractText lst
+              Note _          -> ""
 
 -- | Convert list of Pandoc blocks into (hierarchical) list of Elements
 hierarchicalize :: [Block] -> [Element]
@@ -977,7 +976,9 @@ headerLtEq _ _ = False
 -- Second argument is a list of already used identifiers.
 uniqueIdent :: [Inline] -> [String] -> String
 uniqueIdent title' usedIdents =
-  let baseIdent = inlineListToIdentifier title'
+  let baseIdent = case inlineListToIdentifier title' of
+                        ""   -> "section"
+                        x    -> x
       numIdent n = baseIdent ++ "-" ++ show n
   in  if baseIdent `elem` usedIdents
         then case find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) of
author	John MacFarlane <jgm@berkeley.edu>	2010-03-28 22:29:31 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2010-03-28 22:29:31 -0700
commit	c5c4e19c99003e64665d244bb40ffa0eff93d269 (patch)
tree	a6c4644667a1af55caa2ceb3ac65cfdbbafb3476
parent	be832b3676f01873e38bfd380a1d03f3c06c6d1e (diff)
download	pandoc-c5c4e19c99003e64665d244bb40ffa0eff93d269.tar.gz