From 10c471907693aac3e01e9550ce203834ff367de1 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 3 Sep 2021 21:50:30 -0700 Subject: RTF reader: if doc begins with {\rtf1 ... } only parse its contents. Some documents seem to have non-RTF (e.g. XML) material after the `{\rtf1 ... }` group. --- src/Text/Pandoc/Readers/RTF.hs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/RTF.hs b/src/Text/Pandoc/Readers/RTF.hs index 5e5799b49..e577ee70b 100644 --- a/src/Text/Pandoc/Readers/RTF.hs +++ b/src/Text/Pandoc/Readers/RTF.hs @@ -204,7 +204,13 @@ parseRTF = do skipMany nl toks <- many tok -- return $! traceShowId toks - bs <- (foldM processTok mempty toks >>= emitBlocks) + bs <- (case toks of + -- if we start with {\rtf1...}, parse that and ignore + -- what follows (which in certain cases can be non-RTF content) + tok@(Tok _ (Grouped (Tok _ (ControlWord "rtf" (Just 1)) : _))) : _ + -> foldM processTok mempty [tok] + _ -> foldM processTok mempty toks) + >>= emitBlocks unclosed <- closeContainers let doc = B.doc $ bs <> unclosed kvs <- sMetadata <$> getState -- cgit v1.2.3