aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/RTF.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-09-08 09:30:58 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2021-09-08 09:30:58 -0700
commit0b1c5a87da1117a4dc4a04e308c9edbdc0ea1ec6 (patch)
treed0083537972e3370921dccd20786918d96baa0ed /src/Text/Pandoc/Readers/RTF.hs
parentddfa7b2a630bd123c7b6291769c6ae1fe347aff4 (diff)
downloadpandoc-0b1c5a87da1117a4dc4a04e308c9edbdc0ea1ec6.tar.gz
RTF reader: support `\binN` for binary image data.
Diffstat (limited to 'src/Text/Pandoc/Readers/RTF.hs')
-rw-r--r--src/Text/Pandoc/Readers/RTF.hs33
1 files changed, 22 insertions, 11 deletions
diff --git a/src/Text/Pandoc/Readers/RTF.hs b/src/Text/Pandoc/Readers/RTF.hs
index 4eeab1312..72d1f51d2 100644
--- a/src/Text/Pandoc/Readers/RTF.hs
+++ b/src/Text/Pandoc/Readers/RTF.hs
@@ -31,7 +31,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Shared (safeRead, tshow)
-import Data.Char (isAlphaNum, chr, digitToInt, isAscii, isLetter, isSpace)
+import Data.Char (isAlphaNum, chr, digitToInt, isAscii, isLetter, isSpace, ord)
import qualified Data.ByteString.Lazy as BL
import Data.Digest.Pure.SHA (sha1, showDigest)
import Data.Maybe (mapMaybe, fromMaybe)
@@ -223,6 +223,7 @@ data TokContents =
ControlWord Text (Maybe Int)
| ControlSymbol Char
| UnformattedText Text
+ | BinData BL.ByteString
| HexVal Word8
| Grouped [Tok]
deriving (Show, Eq)
@@ -234,9 +235,22 @@ tok = do
where
controlThing = do
char '\\' *>
- ( (ControlWord <$> letterSequence <*> (parameter <* optional delimChar))
+ ( binData
+ <|> (ControlWord <$> letterSequence <*> (parameter <* optional delimChar))
<|> (HexVal <$> hexVal)
<|> (ControlSymbol <$> anyChar) )
+ binData = try $ do
+ string "bin" <* notFollowedBy letter
+ n <- fromMaybe 0 <$> parameter
+ spaces
+ -- NOTE: We assume here that if the document contains binary
+ -- data, it will not be valid UTF-8 and hence it will have been
+ -- read as latin1, so we can recover the data in the following
+ -- way. This is probably not completely reliable, but I don't
+ -- know if we can do better without making this reader take
+ -- a ByteString input.
+ dat <- BL.pack . map (fromIntegral . ord) <$> count n anyChar
+ return $ BinData dat
parameter = do
hyph <- string "-" <|> pure ""
rest <- many digit
@@ -860,14 +874,10 @@ handlePict :: PandocMonad m => [Tok] -> RTFParser m ()
handlePict toks = do
let pict = foldl' getPictData def toks
let altText = "image"
- let binToWord = T.foldl' (\acc x -> acc * 2 + fromIntegral (digitToInt x)) 0
- let isBinaryDigit '0' = True
- isBinaryDigit '1' = True
- isBinaryDigit _ = False
- let bytes = BL.pack $
- if picBinary pict && T.all isBinaryDigit (picData pict)
- then map binToWord $ T.chunksOf 8 $ picData pict
- else map hexToWord $ T.chunksOf 2 $ picData pict
+ let bytes =
+ if picBinary pict
+ then picBytes pict
+ else BL.pack $ map hexToWord $ T.chunksOf 2 $ picData pict
let (mimetype, ext) =
case picType pict of
Just Emfblip -> (Just "image/x-emf", ".emf")
@@ -894,7 +904,8 @@ handlePict toks = do
ControlWord "pich" (Just h) -> pict{ picHeight = Just h }
ControlWord "picwgoal" (Just w) -> pict{ picWidthGoal = Just w }
ControlWord "pichgoal" (Just h) -> pict{ picHeightGoal = Just h }
- ControlWord "bin" _ -> pict{ picBinary = True }
+ BinData d | not (BL.null d)
+ -> pict{ picBinary = True, picBytes = picBytes pict <> d }
UnformattedText t -> pict{ picData = t }
_ -> pict