From 2d4a22d0be7e2da288c4af43250d6d3c607ccf8b Mon Sep 17 00:00:00 2001
From: fiddlosopher
Date: Sun, 8 Jul 2007 17:33:03 +0000
Subject: Regularized the scheme for unique header identifiers in HTML writer:
 - punctuation is now all removed (except -)
 - spaces are turned into -
 - all lowercase
 This scheme should be fairly predictable. Updated tests accordingly.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@655 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 src/Text/Pandoc/Writers/HTML.hs | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'src/Text/Pandoc')

diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs
index 54d120879..c3ed92f5c 100644
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@@ -34,7 +34,7 @@ import Text.Pandoc.Entities (decodeEntities)
 import Text.Regex ( mkRegex, matchRegex )
 import Numeric ( showHex )
 import Data.Char ( ord, toLower )
-import Data.List ( isPrefixOf, partition )
+import Data.List ( isPrefixOf, partition, intersperse )
 import Control.Monad.State
 import Text.XHtml.Strict
 
@@ -169,22 +169,32 @@ obfuscateChar char =
 obfuscateString :: String -> String
 obfuscateString = (concatMap obfuscateChar) . decodeEntities
 
+-- | True if character is a punctuation character (unicode).
+isPunctuation :: Char -> Bool
+isPunctuation c =
+  let c' = ord c in
+  if (c `elem` "!\"'()*,-./:;<>?[\\]`{|}~") || (c' >= 0x2000 && c' <= 0x206F) ||
+     (c' >= 0xE000 && c' <= 0xE0FF)
+     then True
+     else False
+
 -- | Convert Pandoc inline list to plain text identifier.
 inlineListToIdentifier :: [Inline] -> String
 inlineListToIdentifier [] = ""
 inlineListToIdentifier (x:xs) =
   xAsText ++ inlineListToIdentifier xs
   where xAsText = case x of
-          Str s -> map toLower s
+          Str s -> filter (\c -> (c == '-') || not (isPunctuation c)) $
+                   concat $ intersperse "-" $ words $ map toLower s
           Emph lst -> inlineListToIdentifier lst
           Strong lst -> inlineListToIdentifier lst
           Quoted _ lst -> inlineListToIdentifier lst
           Code s -> s
           Space -> "-"
-          EmDash -> "--"
+          EmDash -> "-"
           EnDash -> "-"
           Apostrophe -> ""
-          Ellipses -> "..."
+          Ellipses -> ""
           LineBreak -> "-"
           TeX _ -> ""
           HtmlInline _ -> ""
-- 
cgit v1.2.3