diff options
-rw-r--r-- | Codec/Binary/UTF8/String.hs | 97 | ||||
-rw-r--r-- | INSTALL | 5 | ||||
-rw-r--r-- | Main.hs | 9 | ||||
-rw-r--r-- | Setup.hs | 2 | ||||
-rw-r--r-- | System/IO/UTF8.hs | 118 | ||||
-rw-r--r-- | Text/Pandoc/ODT.hs | 6 | ||||
-rw-r--r-- | Text/Pandoc/PDF.hs | 5 | ||||
-rw-r--r-- | Text/Pandoc/Shared.hs | 4 | ||||
-rw-r--r-- | Text/Pandoc/TH.hs | 5 | ||||
-rw-r--r-- | Text/Pandoc/UTF8.hs | 76 | ||||
-rw-r--r-- | debian/copyright | 54 | ||||
-rw-r--r-- | pandoc.cabal | 19 |
12 files changed, 246 insertions, 154 deletions
diff --git a/Codec/Binary/UTF8/String.hs b/Codec/Binary/UTF8/String.hs new file mode 100644 index 000000000..27c003f00 --- /dev/null +++ b/Codec/Binary/UTF8/String.hs @@ -0,0 +1,97 @@ +-- +-- | +-- Module : Codec.Binary.UTF8.String +-- Copyright : (c) Eric Mertens 2007 +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer: emertens@galois.com +-- Stability : experimental +-- Portability : portable +-- +-- Support for encoding UTF8 Strings to and from @[Word8]@ +-- + +module Codec.Binary.UTF8.String ( + encode + , decode + , encodeString + , decodeString + ) where + +import Data.Word (Word8) +import Data.Bits ((.|.),(.&.),shiftL,shiftR) +import Data.Char (chr,ord) + +default(Int) + +-- | Encode a string using 'encode' and store the result in a 'String'. +encodeString :: String -> String +encodeString xs = map (toEnum . fromEnum) (encode xs) + +-- | Decode a string using 'decode' using a 'String' as input. +-- | This is not safe but it is necessary if UTF-8 encoded text +-- | has been loaded into a 'String' prior to being decoded. +decodeString :: String -> String +decodeString xs = decode (map (toEnum . fromEnum) xs) + +replacement_character :: Char +replacement_character = '\xfffd' + +-- | Encode a Haskell String to a list of Word8 values, in UTF8 format. +encode :: String -> [Word8] +encode = concatMap (map fromIntegral . go . ord) + where + go oc + | oc <= 0x7f = [oc] + + | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) + , 0x80 + oc .&. 0x3f + ] + + | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + | otherwise = [ 0xf0 + (oc `shiftR` 18) + , 0x80 + ((oc `shiftR` 12) .&. 0x3f) + , 0x80 + ((oc `shiftR` 6) .&. 0x3f) + , 0x80 + oc .&. 0x3f + ] + +-- +-- | Decode a UTF8 string packed into a list of Word8 values, directly to String +-- +decode :: [Word8] -> String +decode [ ] = "" +decode (c:cs) + | c < 0x80 = chr (fromEnum c) : decode cs + | c < 0xc0 = replacement_character : decode cs + | c < 0xe0 = multi1 + | c < 0xf0 = multi_byte 2 0xf 0x800 + | c < 0xf8 = multi_byte 3 0x7 0x10000 + | c < 0xfc = multi_byte 4 0x3 0x200000 + | c < 0xfe = multi_byte 5 0x1 0x4000000 + | otherwise = replacement_character : decode cs + where + multi1 = case cs of + c1 : ds | c1 .&. 0xc0 == 0x80 -> + let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) + in if d >= 0x000080 then toEnum d : decode ds + else replacement_character : decode ds + _ -> replacement_character : decode cs + + multi_byte :: Int -> Word8 -> Int -> [Char] + multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) + where + aux 0 rs acc + | overlong <= acc && acc <= 0x10ffff && + (acc < 0xd800 || 0xdfff < acc) && + (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs + | otherwise = replacement_character : decode rs + + aux n (r:rs) acc + | r .&. 0xc0 == 0x80 = aux (n-1) rs + $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) + + aux _ rs _ = replacement_character : decode rs + @@ -24,10 +24,6 @@ GHC and the required libraries using `apt-get`: sudo apt-get install ghc6 libghc6-xhtml-dev libghc6-mtl-dev libghc6-network-dev -Pandoc will use the [utf8-string] library if it is installed; otherwise, it -will use its own internal module for UTF-8 I/O. The utf8-string library is -not a required dependency, but it may improve performance slightly. - [GHC]: http://www.haskell.org/ghc/ [GHC Download]: http://www.haskell.org/ghc/download.html [Cabal]: http://www.haskell.org/cabal/ @@ -36,7 +32,6 @@ not a required dependency, but it may improve performance slightly. [Fink]: http://finkproject.org [Ubuntu]: http://www.ubuntu.com [debian]: http://www.debian.org/ -[utf8-string]: http://hackage.haskell.org/cgi-bin/hackage-scripts/package/utf8-string Getting the source ------------------ @@ -43,11 +43,7 @@ import Data.Maybe ( fromMaybe ) import Data.Char ( toLower ) import Prelude hiding ( putStrLn, writeFile, readFile, getContents ) import System.IO ( stdout, stderr ) -#ifdef _UTF8STRING import System.IO.UTF8 -#else -import Text.Pandoc.UTF8 -#endif #ifdef _CITEPROC import Text.CSL import Text.Pandoc.Biblio @@ -61,11 +57,6 @@ copyrightMessage = "\nCopyright (C) 2006-7 John MacFarlane\n" ++ compileInfo :: String compileInfo = -#ifdef _UTF8STRING - " +utf8-string" ++ -#else - " -utf8-string" ++ -#endif #ifdef _CITEPROC " +citeproc" ++ #else @@ -17,7 +17,7 @@ main = defaultMainWithHooks $ -- | Run test suite. runTestSuite _ _ _ _ = do - inDirectory "tests" $ runCommand "runhaskell RunTests.hs" >>= waitForProcess + inDirectory "tests" $ runCommand "runhaskell -i.. RunTests.hs" >>= waitForProcess return () -- | If reference.odt needs rebuilding, build it. diff --git a/System/IO/UTF8.hs b/System/IO/UTF8.hs new file mode 100644 index 000000000..d0af4c38e --- /dev/null +++ b/System/IO/UTF8.hs @@ -0,0 +1,118 @@ +----------------------------------------------------------------------------- +-- | +-- Module : System.IO.UTF8 +-- Copyright : (c) Eric Mertens 2007 +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer: emertens@galois.com +-- Stability : experimental +-- Portability : portable +-- +-- String IO preserving UTF8 encoding. +-- + +module System.IO.UTF8 ( + print + , putStr + , putStrLn + , getLine + , readLn + , readFile + , writeFile + , appendFile + , getContents + , hGetLine + , hGetContents + , hPutStr + , hPutStrLn + ) where + +import Control.Monad (liftM) +import Data.Char (ord, chr) +import Data.Word (Word8) +import Prelude (String, ($), (=<<), (>>=), (.), map, toEnum, fromEnum, Read, + Show(..)) +import System.IO (Handle, IO, FilePath) +import qualified System.IO as IO + +import Codec.Binary.UTF8.String (encode, decode) + + +-- | Encode a string in UTF8 form. +encodeString :: String -> String +encodeString xs = bytesToString (encode xs) + +-- | Decode a string from UTF8 +decodeString :: String -> String +decodeString xs = decode (stringToBytes xs) + +-- | Convert a list of bytes to a String +bytesToString :: [Word8] -> String +bytesToString xs = map (chr . fromEnum) xs + +-- | String to list of bytes. +stringToBytes :: String -> [Word8] +stringToBytes xs = map (toEnum . ord) xs + +-- | The 'print' function outputs a value of any printable type to the +-- standard output device. This function differs from the +-- System.IO.print in that it preserves any UTF8 encoding of the shown value. +-- +print :: Show a => a -> IO () +print x = putStrLn (show x) + +-- | Write a UTF8 string to the standard output device +putStr :: String -> IO () +putStr x = IO.putStr (encodeString x) + +-- | The same as 'putStr', but adds a newline character. +putStrLn :: String -> IO () +putStrLn x = IO.putStrLn (encodeString x) + +-- | Read a UTF8 line from the standard input device +getLine :: IO String +getLine = liftM decodeString IO.getLine + +-- | The 'readLn' function combines 'getLine' and 'readIO', preserving UTF8 +readLn :: Read a => IO a +readLn = IO.readIO =<< getLine + +-- | The 'readFile' function reads a file and +-- returns the contents of the file as a UTF8 string. +-- The file is read lazily, on demand, as with 'getContents'. +readFile :: FilePath -> IO String +readFile n = liftM decodeString (IO.openBinaryFile n IO.ReadMode >>= + IO.hGetContents) + +-- | The computation 'writeFile' @file str@ function writes the UTF8 string @str@, +-- to the file @file@. +writeFile :: FilePath -> String -> IO () +writeFile n c = IO.withBinaryFile n IO.WriteMode $ \ h -> + IO.hPutStr h $ encodeString c + +-- | The computation 'appendFile' @file str@ function appends the UTF8 string @str@, +-- to the file @file@. +appendFile :: FilePath -> String -> IO () +appendFile n c = IO.withBinaryFile n IO.AppendMode $ \h -> + IO.hPutStr h $ encodeString c + +-- | Read a UTF8 line from a Handle +hGetLine :: Handle -> IO String +hGetLine h = liftM decodeString $ IO.hGetLine h + +-- | Lazily read a UTF8 string from a Handle +hGetContents :: Handle -> IO String +hGetContents h = liftM decodeString (IO.hGetContents h) + +-- | Write a UTF8 string to a Handle. +hPutStr :: Handle -> String -> IO () +hPutStr h s = IO.hPutStr h (encodeString s) + +-- | Write a UTF8 string to a Handle, appending a newline. +hPutStrLn :: Handle -> String -> IO () +hPutStrLn h s = IO.hPutStrLn h (encodeString s) + +-- | Lazily read stdin as a UTF8 string. +getContents :: IO String +getContents = liftM decodeString IO.getContents + diff --git a/Text/Pandoc/ODT.hs b/Text/Pandoc/ODT.hs index 763625e24..8c3b1b45f 100644 --- a/Text/Pandoc/ODT.hs +++ b/Text/Pandoc/ODT.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE TemplateHaskell, CPP #-} +{-# LANGUAGE TemplateHaskell #-} {- Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu> @@ -44,11 +44,7 @@ import qualified Data.ByteString as B ( writeFile, pack ) import Data.ByteString.Internal ( c2w ) import Prelude hiding ( writeFile, readFile ) import System.IO ( stderr ) -#ifdef _UTF8STRING import System.IO.UTF8 -#else -import Text.Pandoc.UTF8 -#endif -- | Produce an ODT file from OpenDocument XML. saveOpenDocumentAsODT :: FilePath -- ^ Pathname of ODT file to be produced. diff --git a/Text/Pandoc/PDF.hs b/Text/Pandoc/PDF.hs index 4257e4712..1e2d5e9b5 100644 --- a/Text/Pandoc/PDF.hs +++ b/Text/Pandoc/PDF.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE CPP #-} {- Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu> @@ -38,11 +37,7 @@ import System.Environment ( getEnvironment ) import Text.Pandoc.Shared ( withTempDir ) import Prelude hiding ( writeFile, readFile, putStrLn ) import System.IO ( stderr, openFile, IOMode (..), hClose ) -#ifdef _UTF8STRING import System.IO.UTF8 -#else -import Text.Pandoc.UTF8 -#endif -- | Produce an PDF file from LaTeX. saveLaTeXAsPDF :: FilePath -- ^ Pathname of PDF file to be produced. diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs index 38c1cf6b4..2c53ffa7a 100644 --- a/Text/Pandoc/Shared.hs +++ b/Text/Pandoc/Shared.hs @@ -118,11 +118,7 @@ import System.FilePath ( (</>), (<.>) ) import System.IO.Error ( catch, ioError, isAlreadyExistsError ) import System.Directory import Prelude hiding ( putStrLn, writeFile, readFile, getContents ) -#ifdef _UTF8STRING import System.IO.UTF8 -#else -import Text.Pandoc.UTF8 -#endif -- -- List processing diff --git a/Text/Pandoc/TH.hs b/Text/Pandoc/TH.hs index 376dfa00d..dfd6be28b 100644 --- a/Text/Pandoc/TH.hs +++ b/Text/Pandoc/TH.hs @@ -1,4 +1,3 @@ -{-# LANGUAGE CPP #-} {-# OPTIONS_GHC -fno-warn-orphans #-} {- Copyright (C) 2008 John MacFarlane <jgm@berkeley.edu> @@ -39,11 +38,7 @@ import Language.Haskell.TH.Syntax (Lift (..)) import qualified Data.ByteString as B import Data.ByteString.Internal ( w2c ) import Prelude hiding ( readFile ) -#ifdef _UTF8STRING import System.IO.UTF8 -#else -import Text.Pandoc.UTF8 -#endif -- | Insert contents of text file into a template. contentsOf :: FilePath -> ExpQ diff --git a/Text/Pandoc/UTF8.hs b/Text/Pandoc/UTF8.hs deleted file mode 100644 index f8d041db7..000000000 --- a/Text/Pandoc/UTF8.hs +++ /dev/null @@ -1,76 +0,0 @@ --- | Functions for IO using UTF-8 encoding. --- --- The basic encoding and decoding functions are taken from --- <http://www.cse.ogi.edu/~hallgren/Talks/LHiH/base/lib/UTF8.hs>. --- (c) 2003, OGI School of Science & Engineering, Oregon Health and --- Science University. --- --- From the Char module supplied with HBC. --- Modified by Martin Norbaeck to pass illegal UTF-8 sequences unchanged. --- Modified by John MacFarlane to use [Word8] and export IO functions. - -module Text.Pandoc.UTF8 ( - putStrLn - , putStr - , hPutStrLn - , hPutStr - , getContents - , readFile - , writeFile - ) where -import Data.Word -import System.IO ( Handle ) -import qualified Data.ByteString.Lazy as BS -import Prelude hiding ( putStrLn, putStr, getContents, readFile, writeFile ) - -putStrLn :: String -> IO () -putStrLn = BS.putStrLn . BS.pack . toUTF8 - -putStr :: String -> IO () -putStr = BS.putStr . BS.pack . toUTF8 - -hPutStrLn :: Handle -> String -> IO () -hPutStrLn h = BS.hPut h . BS.pack . toUTF8 . (++ "\n") - -hPutStr :: Handle -> String -> IO () -hPutStr h = BS.hPut h . BS.pack . toUTF8 - -readFile :: FilePath -> IO String -readFile p = BS.readFile p >>= return . fromUTF8 . BS.unpack - -writeFile :: FilePath -> String -> IO () -writeFile p = BS.writeFile p . BS.pack . toUTF8 - -getContents :: IO String -getContents = BS.getContents >>= return . fromUTF8 . BS.unpack - --- | Take a list of bytes in UTF-8 encoding and decode it into a Unicode string. -fromUTF8 :: [Word8] -> String -fromUTF8 [] = "" -fromUTF8 (0xef : 0xbb : 0xbf :cs) = fromUTF8 cs -- skip BOM (byte order marker) -fromUTF8 (c:c':cs) | 0xc0 <= c && c <= 0xdf && - 0x80 <= c' && c' <= 0xbf = - toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : fromUTF8 cs -fromUTF8 (c:c':c'':cs) | 0xe0 <= c && c <= 0xef && - 0x80 <= c' && c' <= 0xbf && - 0x80 <= c'' && c'' <= 0xbf = - toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : fromUTF8 cs -fromUTF8 (c:cs) = toEnum (fromEnum c) : fromUTF8 cs - --- | Take a Unicode string and encode it as a list of bytes in UTF-8 encoding. -toUTF8 :: String -> [Word8] -toUTF8 "" = [] -toUTF8 (c:cs) = - if c > '\x0000' && c < '\x0080' then - toEnum (fromEnum c) : toUTF8 cs - else if c < toEnum 0x0800 then - let i = fromEnum c - in toEnum (0xc0 + i `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - toUTF8 cs - else - let i = fromEnum c - in toEnum (0xe0 + i `div` 0x1000) : - toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) : - toEnum (0x80 + i `mod` 0x40) : - toUTF8 cs diff --git a/debian/copyright b/debian/copyright index 533981520..444f73796 100644 --- a/debian/copyright +++ b/debian/copyright @@ -58,42 +58,36 @@ by Eric A. Meyer Released under an explicit Public Domain License ---------------------------------------------------------------------- -UTF8.hs -Copyright (c) 2003, OGI School of Science & Engineering, Oregon Health & -Science University, All rights reserved. -Modified by Martin Norbäck, to pass illegal utf-8 sequences through unchanged. -Modified 2006-8 John MacFarlane. +System/IO/UTF8.hs and Codec/Binary/UTF8/String.hs +from the utf8-string package on HackageDB +Copyright (c) 2007, Galois Inc. +All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -- Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -- Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - -- Neither the name of OGI or OHSU nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Galois Inc. nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY Galois Inc. ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL Galois Inc. BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------- Text/XML/Light/* +from the xml package on HackageDB (c) 2007 Galois Inc. All rights reserved. diff --git a/pandoc.cabal b/pandoc.cabal index d52bb0d63..a58eb1ddf 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -134,9 +134,6 @@ Flag executable Flag library Description: Build the pandoc library. Default: True -Flag utf8-string - Description: Use utf8-string library for UTF-8 I/O. - Default: True Flag citeproc Description: Compile in support for citeproc-hs bibliographic formatting. Default: False @@ -149,11 +146,6 @@ Library if flag(highlighting) Build-depends: highlighting-kate cpp-options: -D_HIGHLIGHTING - if flag(utf8-string) - Build-depends: utf8-string - cpp-options: -D_UTF8STRING - else - Other-Modules: Text.Pandoc.UTF8 if flag(citeproc) Build-depends: citeproc-hs Exposed-Modules: Text.Pandoc.Biblio @@ -190,12 +182,16 @@ Library Text.Pandoc.Writers.S5 Other-Modules: Text.Pandoc.XML, Text.Pandoc.TH, + -- from xml package Text.XML.Light, Text.XML.Light.Types, Text.XML.Light.Output, Text.XML.Light.Input, Text.XML.Light.Proc, - Text.XML.Light.Cursor + Text.XML.Light.Cursor, + -- from utf8-string package + System.IO.UTF8, + Codec.Binary.UTF8.String Extensions: CPP, TemplateHaskell, FlexibleInstances Ghc-Options: -O2 -Wall -threaded Ghc-Prof-Options: -auto-all @@ -214,11 +210,6 @@ Executable pandoc if flag(highlighting) cpp-options: -D_HIGHLIGHTING - if flag(utf8-string) - Build-depends: utf8-string - cpp-options: -D_UTF8STRING - else - Other-Modules: Text.Pandoc.UTF8 if flag(citeproc) Build-depends: citeproc-hs cpp-options: -D_CITEPROC |