diff options
author | John MacFarlane <fiddlosopher@gmail.com> | 2012-09-23 22:12:21 -0700 |
---|---|---|
committer | John MacFarlane <fiddlosopher@gmail.com> | 2012-09-23 22:12:21 -0700 |
commit | 7272735b3d413a644fd9ab01eeae8ae9cd5a925b (patch) | |
tree | bb0fa978cee6702d0bb3300f234b1f08d2071c5f /src/Text | |
parent | 31107741f0ee69d444e5f9ed2d8272583024e10c (diff) | |
download | pandoc-7272735b3d413a644fd9ab01eeae8ae9cd5a925b.tar.gz |
More intelligent handling of text encodings.

Previously, UTF-8 was enforced for both input and output.
The new system:

* For input, UTF-8 is tried first; if an error is raised, the
  locale encoding is tried.
* For output, the locale encoding is always used.

Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/Parsing.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/UTF8.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Docx.hs | 5 |
4 files changed, 22 insertions, 12 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index bee96be82..de4e3a65d 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -146,7 +146,6 @@ where import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (Blocks) -import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) import Text.Parsec import Text.Parsec.Pos (newPos) import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation ) @@ -708,7 +707,7 @@ readWith parser state input = testStringWith :: (Show a) => Parser [Char] ParserState a -> String -> IO () -testStringWith parser str = UTF8.putStrLn $ show $ +testStringWith parser str = putStrLn $ show $ readWith parser defaultParserState str -- | Parsing options. diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index d86f9a390..fa928455d 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -90,7 +90,7 @@ import Paths_pandoc (getDataFileName) import Text.Pandoc.Pretty (charWidth) import System.Locale (defaultTimeLocale) import Data.Time -import System.IO (stderr) +import System.IO (stderr, hPutStrLn) import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) @@ -503,14 +503,14 @@ readDataFile userDir fname = findDataFile userDir fname >>= UTF8.readFile err :: Int -> String -> IO a err exitCode msg = do name <- getProgName - UTF8.hPutStrLn stderr $ name ++ ": " ++ msg + hPutStrLn stderr $ name ++ ": " ++ msg exitWith $ ExitFailure exitCode return undefined warn :: String -> IO () warn msg = do name <- getProgName - UTF8.hPutStrLn stderr $ name ++ ": " ++ msg + hPutStrLn stderr $ name ++ ": " ++ msg -- -- Safe read diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs index aa3a9da04..45664892a 100644 --- a/src/Text/Pandoc/UTF8.hs +++ b/src/Text/Pandoc/UTF8.hs @@ -25,7 +25,11 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Stability : alpha Portability : portable -UTF-8 
aware string IO functions that will work with GHC 6.10, 6.12, or 7. +UTF-8 aware string IO functions that will work with GHC 6.12 or 7. +The reading functions first attempt to read UTF-8; if an encoding +error is encountered, the local encoding is used instead. This +should work well in practice because text in other encodings +is usually not valid UTF-8. -} module Text.Pandoc.UTF8 ( readFile , writeFile @@ -45,10 +49,11 @@ where #else import Codec.Binary.UTF8.String (encodeString, decodeString) #endif - +import Control.Exception (catch, throwIO) +import GHC.IO.Exception (IOException(..), IOErrorType(..)) import System.IO hiding (readFile, writeFile, getContents, putStr, putStrLn, hPutStr, hPutStrLn, hGetContents) -import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn ) +import Prelude hiding (readFile, writeFile, getContents, putStr, putStrLn, catch ) import qualified System.IO as IO readFile :: FilePath -> IO String @@ -75,7 +80,14 @@ hPutStrLn :: Handle -> String -> IO () hPutStrLn h s = hSetEncoding h utf8 >> IO.hPutStrLn h s hGetContents :: Handle -> IO String -hGetContents h = hSetEncoding h utf8_bom >> IO.hGetContents h +hGetContents h = do + hSetEncoding h utf8_bom + catch (IO.hGetContents h) $ \e -> + case ioe_type e of + InvalidArgument -> do + hSetEncoding h localeEncoding + IO.hGetContents h + _ -> throwIO e encodePath :: FilePath -> FilePath decodeArg :: String -> String diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 84bf95dfb..211f8946b 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -33,8 +33,7 @@ import System.FilePath ( (</>) ) import qualified Data.ByteString.Lazy as B import qualified Data.Map as M import Data.ByteString.Lazy.UTF8 ( fromString, toString ) -import Text.Pandoc.UTF8 as UTF8 -import System.IO ( stderr ) +import System.IO ( stderr, hPutStrLn ) import Codec.Archive.Zip import Data.Time.Clock.POSIX import Paths_pandoc ( getDataFileName ) @@ 
-663,7 +662,7 @@ inlineToOpenXML opts (Image alt (src, tit)) = do , mknode "wp:docPr" [("descr",tit),("id","1"),("name","Picture")] () , graphic ] ] else do - liftIO $ UTF8.hPutStrLn stderr $ + liftIO $ hPutStrLn stderr $ "Could not find image `" ++ src ++ "', skipping..." inlinesToOpenXML opts alt |