Correctly implement capitalisation.

Using `map toUpper` to capitalise text is wrong, as e.g. “Straße” should be converted to “STRASSE”, which is 1 character longer. This commit adds a `capitalize` function and replaces 2 identical implementations in different modules (`toCaps` and `capitalize`) with it.
author: Artyom Kazak <yom@artyom.me> 2014-08-03 16:48:55 +0400
committer: Artyom Kazak <yom@artyom.me> 2014-08-03 17:37:37 +0400
commit: ec88d47f23d6761cf2120f76e45ca23cdc478e6c (patch)
tree: 97506dc41a66d0050b94d4a1c086c9e28d2a7927 /src/Text/Pandoc
parent: 842c705097fbb3987145eae85da2261cb264e618 (diff)
download: pandoc-ec88d47f23d6761cf2120f76e45ca23cdc478e6c.tar.gz
3 files changed, 19 insertions, 15 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 9acb959a0..f0e5bbe5d 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -59,6 +59,7 @@ module Text.Pandoc.Shared (
                      normalizeBlocks,
                      removeFormatting,
                      stringify,
+                     capitalize,
                      compactify,
                      compactify',
                      compactify'DL,
@@ -122,6 +123,7 @@ import qualified Data.ByteString.Char8 as B8
 import Text.Pandoc.Compat.Monoid
 import Data.ByteString.Base64 (decodeLenient)
 import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr)
+import qualified Data.Text as T (toUpper, pack, unpack)
 
 #ifdef EMBED_DATA_FILES
 import Text.Pandoc.Data (dataFiles)
@@ -527,6 +529,17 @@ stringify = query go . walk deNote
         deNote (Note _) = Str ""
         deNote x = x
 
+-- | Bring all regular text in a pandoc structure to uppercase.
+-- 
+-- This function correctly handles cases where a lowercase character doesn't
+-- match to a single uppercase character – e.g. “Straße” would be converted
+-- to “STRASSE”, not “STRAßE”.
+capitalize :: Walkable Inline a => a -> a
+capitalize = walk go
+  where go :: Inline -> Inline
+        go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s)
+        go x       = x
+
 -- | Change final list item from @Para@ to @Plain@ if the list contains
 -- no other @Para@ blocks.
 compactify :: [[Block]]  -- ^ List of list items (each a list of blocks)
diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs
index 803617f95..7a9bff4fe 100644
--- a/src/Text/Pandoc/Writers/FB2.hs
+++ b/src/Text/Pandoc/Writers/FB2.hs
@@ -28,7 +28,7 @@ module Text.Pandoc.Writers.FB2 (writeFB2)  where
 import Control.Monad.State (StateT, evalStateT, get, modify)
 import Control.Monad.State (liftM, liftM2, liftIO)
 import Data.ByteString.Base64 (encode)
-import Data.Char (toUpper, toLower, isSpace, isAscii, isControl)
+import Data.Char (toLower, isSpace, isAscii, isControl)
 import Data.List (intersperse, intercalate, isPrefixOf)
 import Data.Either (lefts, rights)
 import Network.Browser (browse, request, setAllowRedirects, setOutHandler)
@@ -44,8 +44,7 @@ import qualified Text.XML.Light.Cursor as XC
 
 import Text.Pandoc.Definition
 import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def)
-import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock)
-import Text.Pandoc.Walk
+import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock, capitalize)
 
 -- | Data to be written at the end of the document:
 -- (foot)notes, URLs, references, images.
@@ -421,10 +420,6 @@ indent = indentBlock
   indentLines ins = let lns = split isLineBreak ins :: [[Inline]]
                     in  intercalate [LineBreak] $ map ((Str spacer):) lns
 
-capitalize :: Inline -> Inline
-capitalize (Str xs) = Str $ map toUpper xs
-capitalize x = x
-
 -- | Convert a Pandoc's Inline element to FictionBook XML representation.
 toXml :: Inline -> FBM [Content]
 toXml (Str s) = return [txt s]
@@ -434,7 +429,7 @@ toXml (Strong ss) = list `liftM` wrap "strong" ss
 toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss
 toXml (Superscript ss) = list `liftM` wrap "sup" ss
 toXml (Subscript ss) = list `liftM` wrap "sub" ss
-toXml (SmallCaps ss) = cMapM toXml $ walk capitalize ss
+toXml (SmallCaps ss) = cMapM toXml $ capitalize ss
 toXml (Quoted SingleQuote ss) = do  -- FIXME: should be language-specific
   inner <- cMapM toXml ss
   return $ [txt "‘"] ++ inner ++ [txt "’"]
diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs
index 897e425c6..a859267cc 100644
--- a/src/Text/Pandoc/Writers/Markdown.hs
+++ b/src/Text/Pandoc/Writers/Markdown.hs
@@ -39,7 +39,7 @@ import Text.Pandoc.Writers.Shared
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space)
 import Data.List ( group, isPrefixOf, find, intersperse, transpose, sortBy )
-import Data.Char ( isSpace, isPunctuation, toUpper )
+import Data.Char ( isSpace, isPunctuation )
 import Data.Ord ( comparing )
 import Text.Pandoc.Pretty
 import Control.Monad.State
@@ -672,10 +672,6 @@ escapeSpaces (Str s) = Str $ substitute " " "\\ " s
 escapeSpaces Space = Str "\\ "
 escapeSpaces x = x
 
-toCaps :: Inline -> Inline
-toCaps (Str s) = Str (map toUpper s)
-toCaps x       = x
-
 -- | Convert Pandoc inline element to markdown.
 inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc
 inlineToMarkdown opts (Span attrs ils) = do
@@ -693,7 +689,7 @@ inlineToMarkdown opts (Emph lst) = do
 inlineToMarkdown opts (Strong lst) = do
   plain <- gets stPlain
   if plain
-     then inlineListToMarkdown opts $ walk toCaps lst
+     then inlineListToMarkdown opts $ capitalize lst
      else do
        contents <- inlineListToMarkdown opts lst
        return $ "**" <> contents <> "**"
@@ -716,7 +712,7 @@ inlineToMarkdown opts (Subscript lst) = do
 inlineToMarkdown opts (SmallCaps lst) = do
   plain <- gets stPlain
   if plain
-     then inlineListToMarkdown opts $ walk toCaps lst
+     then inlineListToMarkdown opts $ capitalize lst
      else do
        contents <- inlineListToMarkdown opts lst
        return $ tagWithAttrs "span"
author	Artyom Kazak <yom@artyom.me>	2014-08-03 16:48:55 +0400
committer	Artyom Kazak <yom@artyom.me>	2014-08-03 17:37:37 +0400
commit	ec88d47f23d6761cf2120f76e45ca23cdc478e6c (patch)
tree	97506dc41a66d0050b94d4a1c086c9e28d2a7927 /src/Text/Pandoc
parent	842c705097fbb3987145eae85da2261cb264e618 (diff)
download	pandoc-ec88d47f23d6761cf2120f76e45ca23cdc478e6c.tar.gz