From 3bf8012bf6e6965a68de76d5bb46782086393da7 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 8 Jan 2014 19:33:14 -0800 Subject: Text.Pandoc.ImageSize: Parse EXIF format JPGs. Note: For now we just assign them all 72 dpi. It wasn't clear to me how to extract the resolution information. At least the aspect ratio will be right, and 72 dpi is the most common setting. Closes #976. --- src/Text/Pandoc/ImageSize.hs | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'src/Text/Pandoc/ImageSize.hs') diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index 9b0850efb..e2a8b8283 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -53,7 +53,8 @@ imageType :: ByteString -> Maybe ImageType imageType img = case B.take 4 img of "\x89\x50\x4e\x47" -> return Png "\x47\x49\x46\x38" -> return Gif - "\xff\xd8\xff\xe0" -> return Jpeg + "\xff\xd8\xff\xe0" -> return Jpeg -- JFIF + "\xff\xd8\xff\xe1" -> return Jpeg -- Exif "%PDF" -> return Pdf "%!PS" | (B.take 4 $ B.drop 1 $ B.dropWhile (/=' ') img) == "EPSF" @@ -139,8 +140,14 @@ gifSize img = do jpegSize :: ByteString -> Maybe ImageSize jpegSize img = do let (hdr, rest) = B.splitAt 4 img - guard $ hdr == "\xff\xd8\xff\xe0" guard $ B.length rest >= 14 + case hdr of + "\xff\xd8\xff\xe0" -> jfifSize rest + "\xff\xd8\xff\xe1" -> exifSize rest + _ -> mzero + +jfifSize :: ByteString -> Maybe ImageSize +jfifSize rest = do let [dpiDensity,dpix1,dpix2,dpiy1,dpiy2] = map fromIntegral $ unpack $ B.take 5 $ B.drop 9 $ rest let factor = case dpiDensity of @@ -149,11 +156,11 @@ jpegSize img = do _ -> const 72 let dpix = factor (shift dpix1 8 + dpix2) let dpiy = factor (shift dpiy1 8 + dpiy2) - (w,h) <- findJpegSize rest + (w,h) <- findJfifSize rest return $ ImageSize { pxX = w, pxY = h, dpiX = dpix, dpiY = dpiy } -findJpegSize :: ByteString -> Maybe (Integer,Integer) -findJpegSize bs = do +findJfifSize :: ByteString -> Maybe (Integer,Integer) +findJfifSize bs = do let bs' = B.dropWhile (=='\xff') $ B.dropWhile (/='\xff') bs case B.uncons bs' of Just (c,bs'') | c >= '\xc0' && c <= '\xc3' -> do @@ -165,8 +172,23 @@ findJpegSize bs = do [c1,c2] -> do let len = shift c1 8 + c2 -- skip variables - findJpegSize $ B.drop len bs'' + findJfifSize $ B.drop len bs'' _ -> fail "JPEG parse error" Nothing -> fail "Did not find length record" +exifSize :: ByteString -> Maybe ImageSize +exifSize rest = do + let bs' = B.takeWhile (/='\xff') $ B.drop 8 rest -- exif data + let (_,bs'') = B.breakSubstring "\xa0\x02" bs' -- width + let rawWidth = B.take 2 $ B.drop 10 bs'' + let (_,bs''') = B.breakSubstring "\xa0\x03" bs' -- height + let rawHeight = B.take 2 $ B.drop 10 bs''' + let tonum bs = case map fromIntegral $ unpack bs of + [x,y] -> Just $ shift x 8 + y + _ -> Nothing + case (tonum rawWidth, tonum rawHeight) of + (Just w, Just h) -> + return $ ImageSize { pxX = w, pxY = h, dpiX = 72, dpiY = 72 } + _ -> fail "Could not determine exif image size" + -- some day, figure out how to parse dpi from exif -- cgit v1.2.3 From 5c8c380a7997156964a5402974f6f464233aab9b Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 9 Jan 2014 11:16:17 -0800 Subject: Better exif parsing, including image resolution. This introduces a dependency on binary >= 0.6, but we depend on binary >= 0.5 via zip-archive anyway. Closes #976. --- pandoc.cabal | 3 +- src/Text/Pandoc/ImageSize.hs | 225 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 212 insertions(+), 16 deletions(-) (limited to 'src/Text/Pandoc/ImageSize.hs') diff --git a/pandoc.cabal b/pandoc.cabal index a1a4c9b40..94d382c4f 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -232,7 +232,8 @@ Library attoparsec >= 0.10 && < 0.11, yaml >= 0.8.3 && < 0.9, vector >= 0.10 && < 0.11, - hslua >= 0.3 && < 0.4 + hslua >= 0.3 && < 0.4, + binary >= 0.6 && < 0.8 Build-Tools: alex, happy if flag(http-conduit) Build-Depends: http-conduit >= 1.9 && < 2.1, diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index e2a8b8283..467205220 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -32,9 +32,14 @@ module Text.Pandoc.ImageSize ( ImageType(..), imageType, imageSize, sizeInPixels, sizeInPoints ) where import Data.ByteString (ByteString, unpack) import qualified Data.ByteString.Char8 as B +import qualified Data.ByteString.Lazy as BL +import Control.Applicative import Control.Monad import Data.Bits +import Data.Binary +import Data.Binary.Get import Text.Pandoc.Shared (safeRead) +import qualified Data.Map as M -- quick and dirty functions to get image sizes -- algorithms borrowed from wwwis.pl @@ -143,7 +148,7 @@ jpegSize img = do guard $ B.length rest >= 14 case hdr of "\xff\xd8\xff\xe0" -> jfifSize rest - "\xff\xd8\xff\xe1" -> exifSize rest + "\xff\xd8\xff\xe1" -> exifSize $ B.takeWhile (/= '\xff') rest _ -> mzero jfifSize :: ByteString -> Maybe ImageSize @@ -177,18 +182,208 @@ findJfifSize bs = do Nothing -> fail "Did not find length record" exifSize :: ByteString -> Maybe ImageSize -exifSize rest = do - let bs' = B.takeWhile (/='\xff') $ B.drop 8 rest -- exif data - let (_,bs'') = B.breakSubstring "\xa0\x02" bs' -- width - let rawWidth = B.take 2 $ B.drop 10 bs'' - let (_,bs''') = B.breakSubstring "\xa0\x03" bs' -- height - let rawHeight = B.take 2 $ B.drop 10 bs''' - let tonum bs = case map fromIntegral $ unpack bs of - [x,y] -> Just $ shift x 8 + y - _ -> Nothing - case (tonum rawWidth, tonum rawHeight) of - (Just w, Just h) -> - return $ ImageSize { pxX = w, pxY = h, dpiX = 72, dpiY = 72 } - _ -> fail "Could not determine exif image size" - -- some day, figure out how to parse dpi from exif +exifSize = -- runGet ((Just <$> exifHeader) `mplus` return Nothing) . + runGet (Just <$> exifHeader) . + BL.fromChunks . (:[]) +exifHeader :: Get ImageSize +exifHeader = do + _app1DataSize <- getWord16be + exifHdr <- getWord32be + unless (exifHdr == 0x45786966) $ fail "Did not find exif header" + zeros <- getWord16be + unless (zeros == 0) $ fail "Expected zeros after exif header" + -- beginning of tiff header -- we read whole thing to use + -- in getting data from offsets: + tiffHeader <- lookAhead getRemainingLazyByteString + byteAlign <- getWord16be + let bigEndian = byteAlign == 0x4d4d + let (getWord16, getWord32, getWord64) = + if bigEndian + then (getWord16be, getWord32be, getWord64be) + else (getWord16le, getWord32le, getWord64le) + let getRational = do + num <- getWord32 + den <- getWord32 + return $ fromIntegral num / fromIntegral den + tagmark <- getWord16 + unless (tagmark == 0x002a) $ fail "Failed alignment sanity check" + ifdOffset <- getWord32 + skip (fromIntegral ifdOffset - 8) -- skip to IDF + numentries <- getWord16 + let ifdEntry = do + tag <- getWord16 >>= \t -> + maybe (fail $ "Unknown tag type " ++ show t) return + (M.lookup t tagTypeTable) + dataFormat <- getWord16 + numComponents <- getWord32 + (fmt, bytesPerComponent) <- + case dataFormat of + 1 -> return (UnsignedByte . runGet getWord8, 1) + 2 -> return (AsciiString, 1) + 3 -> return (UnsignedShort . runGet getWord16, 2) + 4 -> return (UnsignedLong . runGet getWord32, 4) + 5 -> return (UnsignedRational . runGet getRational, 8) + 6 -> return (SignedByte . runGet getWord8, 1) + 7 -> return (Undefined . runGet getWord8, 1) + 8 -> return (SignedShort . runGet getWord16, 2) + 9 -> return (SignedLong . runGet getWord32, 4) + 10 -> return (SignedRational . runGet getRational, 8) + 11 -> return (SingleFloat . runGet getWord32 {- TODO -}, 4) + 12 -> return (DoubleFloat . runGet getWord64 {- TODO -}, 8) + _ -> fail $ "Unknown data format " ++ show dataFormat + let totalBytes = fromIntegral $ numComponents * bytesPerComponent + payload <- if totalBytes <= 4 -- data is right here + then (fmt . BL.fromChunks . (:[])) <$> + (getByteString totalBytes <* + skip (4 - totalBytes)) + else do -- get data from offset + offs <- getWord32 + return $ fmt $ BL.take (fromIntegral totalBytes) $ + BL.drop (fromIntegral offs) tiffHeader + return (tag, payload) + entries <- sequence $ replicate (fromIntegral numentries) ifdEntry + subentries <- case lookup ExifOffset entries of + Just (UnsignedLong offset) -> do + pos <- bytesRead + skip (fromIntegral offset - (fromIntegral pos - 8)) + numsubentries <- getWord16 + sequence $ + replicate (fromIntegral numsubentries) ifdEntry + _ -> return [] + let allentries = entries ++ subentries + (width, height) <- case (lookup ExifImageWidth allentries, + lookup ExifImageHeight allentries) of + (Just (UnsignedLong w), Just (UnsignedLong h)) -> + return (fromIntegral w, fromIntegral h) + _ -> fail "Could not determine image width, height" + let resfactor = case lookup ResolutionUnit allentries of + Just (UnsignedShort 1) -> (100 / 254) + _ -> 1 + let xres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup XResolution allentries + let yres = maybe 72 (\(UnsignedRational x) -> floor $ x * resfactor) + $ lookup YResolution allentries + return $ ImageSize{ + pxX = width + , pxY = height + , dpiX = xres + , dpiY = yres } + +data DataFormat = UnsignedByte Word8 + | AsciiString BL.ByteString + | UnsignedShort Word16 + | UnsignedLong Word32 + | UnsignedRational Rational + | SignedByte Word8 + | Undefined Word8 + | SignedShort Word16 + | SignedLong Word32 + | SignedRational Rational + | SingleFloat Word32 + | DoubleFloat Word64 + deriving (Show) + +data TagType = ImageDescription + | Make + | Model + | Orientation + | XResolution + | YResolution + | ResolutionUnit + | Software + | DateTime + | WhitePoint + | PrimaryChromaticities + | YCbCrCoefficients + | YCbCrPositioning + | ReferenceBlackWhite + | Copyright + | ExifOffset + | ExposureTime + | FNumber + | ExposureProgram + | ISOSpeedRatings + | ExifVersion + | DateTimeOriginal + | DateTimeDigitized + | ComponentConfiguration + | CompressedBitsPerPixel + | ShutterSpeedValue + | ApertureValue + | BrightnessValue + | ExposureBiasValue + | MaxApertureValue + | SubjectDistance + | MeteringMode + | LightSource + | Flash + | FocalLength + | MakerNote + | UserComment + | FlashPixVersion + | ColorSpace + | ExifImageWidth + | ExifImageHeight + | RelatedSoundFile + | ExifInteroperabilityOffset + | FocalPlaneXResolution + | FocalPlaneYResolution + | FocalPlaneResolutionUnit + | SensingMethod + | FileSource + | SceneType + deriving (Show, Eq, Ord) + +tagTypeTable :: M.Map Word16 TagType +tagTypeTable = M.fromList + [ (0x010e, ImageDescription) + , (0x010f, Make) + , (0x0110, Model) + , (0x0112, Orientation) + , (0x011a, XResolution) + , (0x011b, YResolution) + , (0x0128, ResolutionUnit) + , (0x0131, Software) + , (0x0132, DateTime) + , (0x013e, WhitePoint) + , (0x013f, PrimaryChromaticities) + , (0x0211, YCbCrCoefficients) + , (0x0213, YCbCrPositioning) + , (0x0214, ReferenceBlackWhite) + , (0x8298, Copyright) + , (0x8769, ExifOffset) + , (0x829a, ExposureTime) + , (0x829d, FNumber) + , (0x8822, ExposureProgram) + , (0x8827, ISOSpeedRatings) + , (0x9000, ExifVersion) + , (0x9003, DateTimeOriginal) + , (0x9004, DateTimeDigitized) + , (0x9101, ComponentConfiguration) + , (0x9102, CompressedBitsPerPixel) + , (0x9201, ShutterSpeedValue) + , (0x9202, ApertureValue) + , (0x9203, BrightnessValue) + , (0x9204, ExposureBiasValue) + , (0x9205, MaxApertureValue) + , (0x9206, SubjectDistance) + , (0x9207, MeteringMode) + , (0x9208, LightSource) + , (0x9209, Flash) + , (0x920a, FocalLength) + , (0x927c, MakerNote) + , (0x9286, UserComment) + , (0xa000, FlashPixVersion) + , (0xa001, ColorSpace) + , (0xa002, ExifImageWidth) + , (0xa003, ExifImageHeight) + , (0xa004, RelatedSoundFile) + , (0xa005, ExifInteroperabilityOffset) + , (0xa20e, FocalPlaneXResolution) + , (0xa20f, FocalPlaneYResolution) + , (0xa210, FocalPlaneResolutionUnit) + , (0xa217, SensingMethod) + , (0xa300, FileSource) + , (0xa301, SceneType) + ] -- cgit v1.2.3 From b4b16d57865b1d951be79f04f8ad41fb70544077 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 9 Jan 2014 22:50:51 -0800 Subject: Minor improvement to exif parser. --- src/Text/Pandoc/ImageSize.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Text/Pandoc/ImageSize.hs') diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index 467205220..d1aacff1c 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -234,8 +234,8 @@ exifHeader = do _ -> fail $ "Unknown data format " ++ show dataFormat let totalBytes = fromIntegral $ numComponents * bytesPerComponent payload <- if totalBytes <= 4 -- data is right here - then (fmt . BL.fromChunks . (:[])) <$> - (getByteString totalBytes <* + then fmt <$> + (getLazyByteString (fromIntegral totalBytes) <* skip (4 - totalBytes)) else do -- get data from offset offs <- getWord32 -- cgit v1.2.3 From a1abb3eeea2321654a8450725ff6c0d1a18ee0c7 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 14 Jan 2014 10:12:33 -0800 Subject: Allow binary 0.5. Version bump to 1.12.3.1. --- changelog | 4 ++++ pandoc.cabal | 4 ++-- src/Text/Pandoc/ImageSize.hs | 9 ++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'src/Text/Pandoc/ImageSize.hs') diff --git a/changelog b/changelog index 4cbc60388..fde6873ec 100644 --- a/changelog +++ b/changelog @@ -1,3 +1,7 @@ +pandoc (1.12.3.1) + + * Relaxed version constraint on binary, allowing the use of binary 0.5. + pandoc (1.12.3) * The `--bibliography` option now sets the `biblio-files` variable. diff --git a/pandoc.cabal b/pandoc.cabal index c8ff9738a..0199996be 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -1,5 +1,5 @@ Name: pandoc -Version: 1.12.3 +Version: 1.12.3.1 Cabal-Version: >= 1.10 Build-Type: Custom License: GPL @@ -233,7 +233,7 @@ Library yaml >= 0.8.3 && < 0.9, vector >= 0.10 && < 0.11, hslua >= 0.3 && < 0.4, - binary >= 0.6 && < 0.8 + binary >= 0.5 && < 0.8 Build-Tools: alex, happy if flag(http-conduit) Build-Depends: http-conduit >= 1.9 && < 2.1, diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index d1aacff1c..14575244d 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -182,9 +182,12 @@ findJfifSize bs = do Nothing -> fail "Did not find length record" exifSize :: ByteString -> Maybe ImageSize -exifSize = -- runGet ((Just <$> exifHeader) `mplus` return Nothing) . - runGet (Just <$> exifHeader) . - BL.fromChunks . (:[]) +exifSize = runGet (Just <$> exifHeader) . BL.fromChunks . (:[]) +-- NOTE: It would be nicer to do +-- runGet ((Just <$> exifHeader) <|> return Nothing) +-- which would prevent pandoc from raising an error when an exif header can't +-- be parsed. But we only get an Alternative instance for Get in binary 0.6, +-- and binary 0.5 ships with ghc 7.6. exifHeader :: Get ImageSize exifHeader = do -- cgit v1.2.3 From a333d9788e0f510f681ae1b5f0f246434ee15d62 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 24 Jan 2014 16:00:02 -0800 Subject: ImageSize: Avoid use of lookAhead, which is not in binary >= 0.6. Closes #1124. --- src/Text/Pandoc/ImageSize.hs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src/Text/Pandoc/ImageSize.hs') diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index 14575244d..3c9623b3c 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -182,15 +182,16 @@ findJfifSize bs = do Nothing -> fail "Did not find length record" exifSize :: ByteString -> Maybe ImageSize -exifSize = runGet (Just <$> exifHeader) . BL.fromChunks . (:[]) +exifSize bs = runGet (Just <$> exifHeader bl) bl + where bl = BL.fromChunks [bs] -- NOTE: It would be nicer to do -- runGet ((Just <$> exifHeader) <|> return Nothing) -- which would prevent pandoc from raising an error when an exif header can't -- be parsed. But we only get an Alternative instance for Get in binary 0.6, -- and binary 0.5 ships with ghc 7.6. -exifHeader :: Get ImageSize -exifHeader = do +exifHeader :: BL.ByteString -> Get ImageSize +exifHeader hdr = do _app1DataSize <- getWord16be exifHdr <- getWord32be unless (exifHdr == 0x45786966) $ fail "Did not find exif header" @@ -198,7 +199,7 @@ exifHeader = do unless (zeros == 0) $ fail "Expected zeros after exif header" -- beginning of tiff header -- we read whole thing to use -- in getting data from offsets: - tiffHeader <- lookAhead getRemainingLazyByteString + let tiffHeader = BL.drop 8 hdr byteAlign <- getWord16be let bigEndian = byteAlign == 0x4d4d let (getWord16, getWord32, getWord64) = -- cgit v1.2.3