From 6543b05116ee58ef4de62f93dcafeb27617d83e6 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 30 Jul 2021 17:23:46 -0600 Subject: Add RTF reader. - `rtf` is now supported as an input format as well as output. - New module Text.Pandoc.Readers.RTF (exporting `readRTF`). [API change] Closes #3982. --- test/Tests/Readers/FB2.hs | 2 +- test/Tests/Readers/RTF.hs | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 test/Tests/Readers/RTF.hs (limited to 'test/Tests/Readers') diff --git a/test/Tests/Readers/FB2.hs b/test/Tests/Readers/FB2.hs index 42054a235..d540f8b6a 100644 --- a/test/Tests/Readers/FB2.hs +++ b/test/Tests/Readers/FB2.hs @@ -7,7 +7,7 @@ Stability : alpha Portability : portable -Tests for the EPUB mediabag. +Tests for the FB2 reader. -} module Tests.Readers.FB2 (tests) where diff --git a/test/Tests/Readers/RTF.hs b/test/Tests/Readers/RTF.hs new file mode 100644 index 000000000..2a741bba8 --- /dev/null +++ b/test/Tests/Readers/RTF.hs @@ -0,0 +1,48 @@ +{- | + Module : Tests.Readers.RTF + Copyright : © 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : jgm@berkeley.edu + Stability : alpha + Portability : portable + +Tests for the RTF reader. +-} +module Tests.Readers.RTF (tests) where + +import Test.Tasty +import Tests.Helpers +import Test.Tasty.Golden (goldenVsString) +import qualified Data.ByteString as BS +import Text.Pandoc +import Text.Pandoc.UTF8 (toText, fromStringLazy) +import Data.Text (Text, unpack) +import System.FilePath (replaceExtension, (), (<.>)) + +rtfToNative :: Text -> Text +rtfToNative = + purely (writeNative def{ writerTemplate = Just mempty }) . + purely (readRTF def) + +rtfTest :: TestName -> TestTree +rtfTest name = goldenVsString name native + (fromStringLazy . filter (/='\r') . unpack . rtfToNative . toText + <$> BS.readFile path) + where native = replaceExtension path ".native" + path = "rtf" name <.> "rtf" + +tests :: [TestTree] +tests = map rtfTest [ "footnote" + , "accent" + , "unicode" + , "image" + , "link" + , "heading" + , "formatting" + , "list_simple" + , "list_complex" + , "bookmark" + , "table_simple" + ] + -- cgit v1.2.3 From 7ca4233793f3ba42b2c79c3526a4ab2664fea2e2 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 10 Aug 2021 11:11:33 -0700 Subject: Add test for #7488. --- test/Tests/Readers/RTF.hs | 1 + test/rtf/table_error_codes.native | 146 +++++++++++++++++++ test/rtf/table_error_codes.rtf | 300 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 447 insertions(+) create mode 100644 test/rtf/table_error_codes.native create mode 100644 test/rtf/table_error_codes.rtf (limited to 'test/Tests/Readers') diff --git a/test/Tests/Readers/RTF.hs b/test/Tests/Readers/RTF.hs index 2a741bba8..da0ddfc93 100644 --- a/test/Tests/Readers/RTF.hs +++ b/test/Tests/Readers/RTF.hs @@ -44,5 +44,6 @@ tests = map rtfTest [ "footnote" , "list_complex" , "bookmark" , "table_simple" + , "table_error_codes" ] diff --git a/test/rtf/table_error_codes.native b/test/rtf/table_error_codes.native new file mode 100644 index 000000000..bc75d4f81 --- /dev/null +++ b/test/rtf/table_error_codes.native @@ -0,0 +1,146 @@ +Pandoc (Meta {unMeta = fromList []}) +[Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault)] + (TableHead ("",[],[]) + []) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "Code"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "Error"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocFailOnWarningError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocAppError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocTemplateError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocOptionError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "21"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocUnknownReaderError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "22"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocUnknownWriterError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "23"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocUnsupportedExtensionError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "24"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocCiteprocError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "31"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocEpubSubdirectoryError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "43"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocPDFError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "44"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocXMLError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "47"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocPDFProgramNotFoundError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "61"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocHttpError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "62"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocShouldNeverHappenError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "63"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocSomeError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "64"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocParseError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "65"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocParsecError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "66"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocMakePDFError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "67"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocSyntaxMapError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "83"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocFilterError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "91"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocMacroLoop"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "92"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocUTF8DecodingError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "93"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocIpynbDecodingError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "94"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocUnsupportedCharsetError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "97"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocCouldNotFindDataFileError"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "99"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "PandocResourceNotFound"]]]])] + (TableFoot ("",[],[]) + [])] diff --git a/test/rtf/table_error_codes.rtf b/test/rtf/table_error_codes.rtf new file mode 100644 index 000000000..79ed61e10 --- /dev/null +++ b/test/rtf/table_error_codes.rtf @@ -0,0 +1,300 @@ +{\rtf1\ansi +{ +\trowd \trgaph120 +\clbrdrb\brdrs\cellx4320\clbrdrb\brdrs\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 Code\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 Error\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 3\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocFailOnWarningError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 4\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocAppError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 5\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocTemplateError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 6\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocOptionError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 21\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocUnknownReaderError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 22\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocUnknownWriterError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 23\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocUnsupportedExtensionError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 24\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocCiteprocError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 31\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocEpubSubdirectoryError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 43\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocPDFError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 44\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocXMLError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 47\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocPDFProgramNotFoundError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 61\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocHttpError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 62\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocShouldNeverHappenError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 63\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocSomeError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 64\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocParseError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 65\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocParsecError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 66\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocMakePDFError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 67\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocSyntaxMapError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 83\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocFilterError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 91\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocMacroLoop\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 92\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocUTF8DecodingError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 93\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocIpynbDecodingError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 94\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocUnsupportedCharsetError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 97\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocCouldNotFindDataFileError\par} +\cell} +} +\intbl\row} +{ +\trowd \trgaph120 +\cellx4320\cellx8640 +\trkeep\intbl +{ +{{\pard\intbl \qr \f0 \sa0 \li0 \fi0 99\par} +\cell} +{{\pard\intbl \ql \f0 \sa0 \li0 \fi0 PandocResourceNotFound\par} +\cell} +} +\intbl\row} +{\pard \ql \f0 \sa180 \li0 \fi0 \par} +} -- cgit v1.2.3 From 06d97131e530d2ee9b14617290a157dd42c0db30 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 10 Aug 2021 22:07:48 -0700 Subject: Tests.Helpers: export testGolden and use it in RTF reader. This gives a diff output on failure. --- test/Tests/Helpers.hs | 23 ++++++++++++++++++++++- test/Tests/Readers/RTF.hs | 17 +++++------------ 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'test/Tests/Readers') diff --git a/test/Tests/Helpers.hs b/test/Tests/Helpers.hs index a48a5894e..6c06e3f71 100644 --- a/test/Tests/Helpers.hs +++ b/test/Tests/Helpers.hs @@ -16,6 +16,7 @@ module Tests.Helpers ( test , TestResult(..) , setupEnvironment , showDiff + , testGolden , (=?>) , purely , ToString(..) @@ -23,13 +24,16 @@ module Tests.Helpers ( test ) where +import System.FilePath import Data.Algorithm.Diff import qualified Data.Map as M +import qualified Text.Pandoc.UTF8 as UTF8 import Data.Text (Text, unpack) +import qualified Data.Text as T import System.Exit -import System.FilePath (takeDirectory) import qualified System.Environment as Env import Test.Tasty +import Test.Tasty.Golden.Advanced (goldenTest) import Test.Tasty.HUnit import Text.Pandoc.Builder (Blocks, Inlines, doc, plain) import Text.Pandoc.Class @@ -61,6 +65,23 @@ test fn name (input, expected) = dashes "" = replicate 72 '-' dashes x = replicate (72 - length x - 5) '-' ++ " " ++ x ++ " ---" +testGolden :: TestName -> FilePath -> FilePath -> (Text -> IO Text) -> TestTree +testGolden name expectedPath inputPath fn = + goldenTest + name + (UTF8.readFile expectedPath) + (UTF8.readFile inputPath >>= fn) + compareVals + (UTF8.writeFile expectedPath) + where + compareVals expected actual + | expected == actual = return Nothing + | otherwise = return $ Just $ + "\n--- " ++ expectedPath ++ "\n+++\n" ++ + showDiff (1,1) + (getDiff (lines . filter (/='\r') $ T.unpack actual) + (lines . filter (/='\r') $ T.unpack expected)) + -- | Set up environment for pandoc command tests. setupEnvironment :: FilePath -> IO [(String, String)] setupEnvironment testExePath = do diff --git a/test/Tests/Readers/RTF.hs b/test/Tests/Readers/RTF.hs index da0ddfc93..1b335274b 100644 --- a/test/Tests/Readers/RTF.hs +++ b/test/Tests/Readers/RTF.hs @@ -13,25 +13,18 @@ module Tests.Readers.RTF (tests) where import Test.Tasty import Tests.Helpers -import Test.Tasty.Golden (goldenVsString) -import qualified Data.ByteString as BS import Text.Pandoc -import Text.Pandoc.UTF8 (toText, fromStringLazy) -import Data.Text (Text, unpack) import System.FilePath (replaceExtension, (), (<.>)) -rtfToNative :: Text -> Text -rtfToNative = - purely (writeNative def{ writerTemplate = Just mempty }) . - purely (readRTF def) - rtfTest :: TestName -> TestTree -rtfTest name = goldenVsString name native - (fromStringLazy . filter (/='\r') . unpack . rtfToNative . toText - <$> BS.readFile path) +rtfTest name = testGolden name native path + (\t -> runIOorExplode + (readRTF def t >>= + writeNative def{ writerTemplate = Just mempty })) where native = replaceExtension path ".native" path = "rtf" name <.> "rtf" + tests :: [TestTree] tests = map rtfTest [ "footnote" , "accent" -- cgit v1.2.3 From e37cf4484d38c171d9f7477a8ae9eca9643cc426 Mon Sep 17 00:00:00 2001 From: OCzarnecki <44535552+OCzarnecki@users.noreply.github.com> Date: Mon, 16 Aug 2021 06:57:57 +0200 Subject: Multimarkdown sub- and superscripts (#5512) (#7188) Added an extension `short_subsuperscripts` which modifies the behavior of `subscript` and `superscript`, allowing subscripts or superscripts containing only alphanumerics to end with a space character (eg. `x^2 = 4` or `H~2 is combustible`). This improves support for multimarkdown. Closes #5512. Add `Ext_short_subsuperscripts` constructor to `Extension` [API change]. This is enabled by default for `markdown_mmd`. --- MANUAL.txt | 12 ++++++++++ src/Text/Pandoc/Extensions.hs | 11 ++++----- src/Text/Pandoc/Readers/Markdown.hs | 24 ++++++++++++------- test/Tests/Readers/Markdown.hs | 48 +++++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 15 deletions(-) (limited to 'test/Tests/Readers') diff --git a/MANUAL.txt b/MANUAL.txt index a6edd8ccd..bed3b2009 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -5304,6 +5304,18 @@ For elements that accept attributes, a `data-pos` attribute is added; other elements are placed in a surrounding Div or Span elemnet with a `data-pos` attribute. +#### Extension: `short_subsuperscript` #### + +Parse multimarkdown style subscripts and superscripts, which start with +a '~' or '^' character, respectively, and include the alphanumeric sequence +that follows. For example: + + x^2 = 4 + +or + + Oxygen is O~2. + ## Markdown variants In addition to pandoc's extended Markdown, the following Markdown diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index ce04ce641..2ef8f64e9 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -124,6 +124,7 @@ data Extension = | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid] | Ext_mmd_link_attributes -- ^ MMD style reference link attributes | Ext_mmd_title_block -- ^ Multimarkdown metadata block + | Ext_short_subsuperscripts -- ^ sub-&superscripts w/o closing char (v~i) | Ext_multiline_tables -- ^ Pandoc-style multiline tables | Ext_native_divs -- ^ Use Div blocks for contents of
tags | Ext_native_spans -- ^ Use Span inlines for contents of @@ -286,14 +287,9 @@ multimarkdownExtensions = extensionsFromList , Ext_auto_identifiers , Ext_mmd_header_identifiers , Ext_implicit_figures - -- Note: MMD's syntax for superscripts and subscripts - -- is a bit more permissive than pandoc's, allowing - -- e^2 and a~1 instead of e^2^ and a~1~, so even with - -- these options we don't have full support for MMD - -- superscripts and subscripts, but there's no reason - -- not to include these: - , Ext_superscript + , Ext_short_subsuperscripts , Ext_subscript + , Ext_superscript , Ext_backtick_code_blocks , Ext_spaced_reference_links -- So far only in dev version of mmd: @@ -464,6 +460,7 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_gutenberg , Ext_smart , Ext_literate_haskell + , Ext_short_subsuperscripts , Ext_rebase_relative_paths ] getAll "markdown_strict" = allMarkdownExtensions diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 2dc7ddf52..536e502cf 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -1692,21 +1692,29 @@ strikeout = fmap B.strikeout <$> superscript :: PandocMonad m => MarkdownParser m (F Inlines) superscript = do - guardEnabled Ext_superscript fmap B.superscript <$> try (do char '^' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '^')) + mconcat <$> (try regularSuperscript <|> try mmdShortSuperscript)) + where regularSuperscript = many1Till (do guardEnabled Ext_superscript + notFollowedBy spaceChar + notFollowedBy newline + inline) (char '^') + mmdShortSuperscript = do guardEnabled Ext_short_subsuperscripts + result <- take1WhileP isAlphaNum + return $ return $ return $ B.str result subscript :: PandocMonad m => MarkdownParser m (F Inlines) subscript = do - guardEnabled Ext_subscript fmap B.subscript <$> try (do char '~' - mconcat <$> many1Till (do notFollowedBy spaceChar - notFollowedBy newline - inline) (char '~')) + mconcat <$> (try regularSubscript <|> mmdShortSubscript)) + where regularSubscript = many1Till (do guardEnabled Ext_subscript + notFollowedBy spaceChar + notFollowedBy newline + inline) (char '~') + mmdShortSubscript = do guardEnabled Ext_short_subsuperscripts + result <- take1WhileP isAlphaNum + return $ return $ return $ B.str result whitespace :: PandocMonad m => MarkdownParser m (F Inlines) whitespace = spaceChar >> return <$> (lb <|> regsp) "whitespace" diff --git a/test/Tests/Readers/Markdown.hs b/test/Tests/Readers/Markdown.hs index f055ab197..02fc0d8ce 100644 --- a/test/Tests/Readers/Markdown.hs +++ b/test/Tests/Readers/Markdown.hs @@ -36,6 +36,9 @@ markdownGH :: Text -> Pandoc markdownGH = purely $ readMarkdown def { readerExtensions = githubMarkdownExtensions } +markdownMMD :: Text -> Pandoc +markdownMMD = purely $ readMarkdown def { + readerExtensions = multimarkdownExtensions } infix 4 =: (=:) :: ToString c => String -> (Text, c) -> TestTree @@ -360,6 +363,51 @@ tests = [ testGroup "inline code" ("**this should \"be bold**" =?> para (strong "this should \8220be bold")) ] + , testGroup "sub- and superscripts" + [ + test markdownMMD "normal subscript" + ("H~2~" + =?> para ("H" <> subscript "2")) + , test markdownMMD "normal superscript" + ("x^3^" + =?> para ("x" <> superscript "3")) + , test markdownMMD "short subscript delimeted by space" + ("O~2 is dangerous" + =?> para ("O" <> subscript "2" <> space <> "is dangerous")) + , test markdownMMD "short subscript delimeted by newline" + ("O~2\n" + =?> para ("O" <> subscript "2")) + , test markdownMMD "short subscript delimeted by EOF" + ("O~2" + =?> para ("O" <> subscript "2")) + , test markdownMMD "short subscript delimited by punctuation" + ("O~2." + =?> para ("O" <> subscript "2" <> ".")) + , test markdownMMD "short subscript delimited by emph" + ("O~2*combustible!*" + =?> para ("O" <> subscript "2" <> emph "combustible!")) + , test markdownMMD "no nesting in short subscripts" + ("y~*2*" + =?> para ("y~" <> emph "2")) + , test markdownMMD "short superscript delimeted by space" + ("x^2 = y" + =?> para ("x" <> superscript "2" <> space <> "= y")) + , test markdownMMD "short superscript delimeted by newline" + ("x^2\n" + =?> para ("x" <> superscript "2")) + , test markdownMMD "short superscript delimeted by ExF" + ("x^2" + =?> para ("x" <> superscript "2")) + , test markdownMMD "short superscript delimited by punctuation" + ("x^2." + =?> para ("x" <> superscript "2" <> ".")) + , test markdownMMD "short superscript delimited by emph" + ("x^2*combustible!*" + =?> para ("x" <> superscript "2" <> emph "combustible!")) + , test markdownMMD "no nesting in short superscripts" + ("y^*2*" + =?> para ("y^" <> emph "2")) + ] , testGroup "footnotes" [ "indent followed by newline and flush-left text" =: "[^1]\n\n[^1]: my note\n\n \nnot in note\n" -- cgit v1.2.3 From 0f98cbff4b61b8e79f386f77d18b3218f1214b25 Mon Sep 17 00:00:00 2001 From: Milan Bracke Date: Fri, 1 Oct 2021 11:34:14 +0200 Subject: Avoid blockquote when parent style has more indent When a paragraph has an indentation different from the parent (named) style, it used to be considered a blockquote. But this only makes sense when the paragraph has more indentation. So this commit adds a check for the indentation of the parent style. --- src/Text/Pandoc/Readers/Docx.hs | 66 +++++++++++----------- src/Text/Pandoc/Readers/Docx/Parse.hs | 27 ++------- src/Text/Pandoc/Readers/Docx/Parse/Styles.hs | 26 +++++++++ test/Tests/Readers/Docx.hs | 4 ++ test/docx/image_vml.native | 5 +- test/docx/relative_indentation_blockquotes.docx | Bin 0 -> 12492 bytes test/docx/relative_indentation_blockquotes.native | 4 ++ 7 files changed, 76 insertions(+), 56 deletions(-) create mode 100644 test/docx/relative_indentation_blockquotes.docx create mode 100644 test/docx/relative_indentation_blockquotes.native (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index dd6f39431..66cd84291 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -535,34 +535,36 @@ trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils extraAttr :: (Eq (StyleName a), HasStyleName a) => a -> Attr extraAttr s = ("", [], [("custom-style", fromStyleName $ getStyleName s)]) -parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) -parStyleToTransform pPr = case pStyle pPr of - c@(getStyleName -> styleName):cs - | styleName `elem` divsToKeep -> do - let pPr' = pPr { pStyle = cs } - transform <- parStyleToTransform pPr' - return $ divWith ("", [normalizeToClassName styleName], []) . transform - | styleName `elem` listParagraphStyles -> do - let pPr' = pPr { pStyle = cs, indentation = Nothing} - transform <- parStyleToTransform pPr' - return $ divWith ("", [normalizeToClassName styleName], []) . transform - | otherwise -> do - let pPr' = pPr { pStyle = cs } - transform <- parStyleToTransform pPr' - styles <- asks (isEnabled Ext_styles . docxOptions) - return $ - (if styles then divWith (extraAttr c) else id) - . (if isBlockQuote c then blockQuote else id) - . transform - [] - | Just left <- indentation pPr >>= leftParIndent -> do - let pPr' = pPr { indentation = Nothing } - hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent - transform <- parStyleToTransform pPr' - return $ if (left - hang) > 0 - then blockQuote . transform - else transform - | otherwise -> return id +paragraphStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) +paragraphStyleToTransform pPr = + let stylenames = map getStyleName (pStyle pPr) + transform = if (`elem` listParagraphStyles) `any` stylenames || relativeIndent pPr <= 0 + then id + else blockQuote + in do + extStylesEnabled <- asks (isEnabled Ext_styles . docxOptions) + return $ foldr (\parStyle transform' -> + (parStyleToTransform extStylesEnabled parStyle) . transform' + ) transform (pStyle pPr) + +parStyleToTransform :: Bool -> ParStyle -> Blocks -> Blocks +parStyleToTransform extStylesEnabled parStyle@(getStyleName -> styleName) + | (styleName `elem` divsToKeep) || (styleName `elem` listParagraphStyles) = + divWith ("", [normalizeToClassName styleName], []) + | otherwise = + (if extStylesEnabled then divWith (extraAttr parStyle) else id) + . (if isBlockQuote parStyle then blockQuote else id) + +-- The relative indent is the indentation minus the indentation of the parent style. +-- This tells us whether this paragraph in particular was indented more and thus +-- should be considered a block quote. +relativeIndent :: ParagraphStyle -> Integer +relativeIndent pPr = + let pStyleLeft = fromMaybe 0 $ pStyleIndentation pPr >>= leftParIndent + pStyleHang = fromMaybe 0 $ pStyleIndentation pPr >>= hangingParIndent + left = fromMaybe pStyleLeft $ indentation pPr >>= leftParIndent + hang = fromMaybe pStyleHang $ indentation pPr >>= hangingParIndent + in (left - hang) - (pStyleLeft - pStyleHang) normalizeToClassName :: (FromStyleName a) => a -> T.Text normalizeToClassName = T.map go . fromStyleName @@ -581,7 +583,7 @@ bodyPartToBlocks (Paragraph pPr parparts) local (\s -> s{ docxInBidi = True }) (bodyPartToBlocks (Paragraph pPr' parparts)) | isCodeDiv pPr = do - transform <- parStyleToTransform pPr + transform <- paragraphStyleToTransform pPr return $ transform $ codeBlock $ @@ -608,7 +610,7 @@ bodyPartToBlocks (Paragraph pPr parparts) else prevParaIls <> space) <> ils' handleInsertion = do modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain ils'' opts <- asks docxOptions case (pChange pPr', readerTrackChanges opts) of @@ -623,7 +625,7 @@ bodyPartToBlocks (Paragraph pPr parparts) , AllChanges) -> do let attr = ("", ["paragraph-insertion"], addAuthorAndDate cAuthor cDate) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain $ ils'' <> insertMark (Just (TrackedChange Deletion _), AcceptChanges) -> do @@ -635,7 +637,7 @@ bodyPartToBlocks (Paragraph pPr parparts) , AllChanges) -> do let attr = ("", ["paragraph-deletion"], addAuthorAndDate cAuthor cDate) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr' + transform <- paragraphStyleToTransform pPr' return $ transform $ paraOrPlain $ ils'' <> insertMark _ -> handleInsertion diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 5f29ac41a..e4d3ea6f8 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -50,6 +50,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , archiveToDocxWithWarnings , getStyleNames , pHeading + , pStyleIndentation , constructBogusParStyleData , leftBiasedMergeRunStyle , rowsToRowspans @@ -194,11 +195,6 @@ data Notes = Notes NameSpaces data Comments = Comments NameSpaces (M.Map T.Text Element) deriving Show -data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer - , rightParIndent :: Maybe Integer - , hangingParIndent :: Maybe Integer} - deriving Show - data ChangeType = Insertion | Deletion deriving Show @@ -439,6 +435,7 @@ getStyleNames = fmap getStyleName constructBogusParStyleData :: ParaStyleName -> ParStyle constructBogusParStyleData stName = ParStyle { headingLev = Nothing + , indent = Nothing , numInfo = Nothing , psParentStyle = Nothing , pStyleName = stName @@ -673,20 +670,6 @@ elemToCell ns element | isElem ns "w" "tc" element = return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem -elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation -elemToParIndentation ns element | isElem ns "w" "ind" element = - Just ParIndentation { - leftParIndent = - findAttrByName ns "w" "left" element >>= - stringToInteger - , rightParIndent = - findAttrByName ns "w" "right" element >>= - stringToInteger - , hangingParIndent = - findAttrByName ns "w" "hanging" element >>= - stringToInteger } -elemToParIndentation _ _ = Nothing - testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of @@ -699,6 +682,9 @@ pHeading = getParStyleField headingLev . pStyle pNumInfo :: ParagraphStyle -> Maybe (T.Text, T.Text) pNumInfo = getParStyleField numInfo . pStyle +pStyleIndentation :: ParagraphStyle -> Maybe ParIndentation +pStyleIndentation style = (getParStyleField indent . pStyle) style + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element @@ -1086,8 +1072,7 @@ elemToParagraphStyle ns element sty in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style , indentation = - findChildByName ns "w" "ind" pPr >>= - elemToParIndentation ns + getIndentation ns element , dropCap = case findChildByName ns "w" "framePr" pPr >>= diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index 0d7271d6a..bb28b3009 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -21,6 +21,7 @@ module Text.Pandoc.Readers.Docx.Parse.Styles ( , CharStyle , ParaStyleId(..) , ParStyle(..) + , ParIndentation(..) , RunStyle(..) , HasStyleName , StyleName @@ -37,6 +38,7 @@ module Text.Pandoc.Readers.Docx.Parse.Styles ( , fromStyleName , fromStyleId , stringToInteger + , getIndentation , getNumInfo , elemToRunStyle , defaultRunStyle @@ -115,7 +117,13 @@ data RunStyle = RunStyle { isBold :: Maybe Bool } deriving Show +data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer + , rightParIndent :: Maybe Integer + , hangingParIndent :: Maybe Integer} + deriving Show + data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int) + , indent :: Maybe ParIndentation , numInfo :: Maybe (T.Text, T.Text) , psParentStyle :: Maybe ParStyle , pStyleName :: ParaStyleName @@ -290,6 +298,23 @@ getHeaderLevel ns element , n > 0 = Just (styleName, fromInteger n) getHeaderLevel _ _ = Nothing +getIndentation :: NameSpaces -> Element -> Maybe ParIndentation +getIndentation ns el = do + indElement <- findChildByName ns "w" "pPr" el >>= + findChildByName ns "w" "ind" + return $ ParIndentation + { + leftParIndent = findAttrByName ns "w" "left" indElement <|> + findAttrByName ns "w" "start" indElement >>= + stringToInteger + , rightParIndent = findAttrByName ns "w" "right" indElement <|> + findAttrByName ns "w" "end" indElement >>= + stringToInteger + , hangingParIndent = (findAttrByName ns "w" "hanging" indElement >>= stringToInteger) <|> + fmap negate + (findAttrByName ns "w" "firstLine" indElement >>= stringToInteger) + } + getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") @@ -314,6 +339,7 @@ elemToParStyleData ns element parentStyle = Just $ ParStyle { headingLev = getHeaderLevel ns element + , indent = getIndentation ns element , numInfo = getNumInfo ns element , psParentStyle = parentStyle , pStyleName = styleName diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 220c7d9c5..2f28af317 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -305,6 +305,10 @@ tests = [ testGroup "document" "blockquotes (parsing indent as blockquote)" "docx/block_quotes.docx" "docx/block_quotes_parse_indent.native" + , testCompare + "blockquotes (parsing indent relative to the indent of the parent style as blockquote)" + "docx/relative_indentation_blockquotes.docx" + "docx/relative_indentation_blockquotes.native" , testCompare "hanging indents" "docx/hanging_indent.docx" diff --git a/test/docx/image_vml.native b/test/docx/image_vml.native index e9fded614..5991f333c 100644 --- a/test/docx/image_vml.native +++ b/test/docx/image_vml.native @@ -1,4 +1,3 @@ [Header 1 ("vml-image",[],[]) [Strong [Str "VML",Space,Str "Image"]] -,BlockQuote - [Para [Str "It",Space,Str "should",Space,Str "follow",Space,Str "below:"] - ,Para [Image ("",[],[]) [] ("media/image4.jpeg","")]]] +,Para [Str "It",Space,Str "should",Space,Str "follow",Space,Str "below:"] +,Para [Image ("",[],[]) [] ("media/image4.jpeg","")]] diff --git a/test/docx/relative_indentation_blockquotes.docx b/test/docx/relative_indentation_blockquotes.docx new file mode 100644 index 000000000..f318a8509 Binary files /dev/null and b/test/docx/relative_indentation_blockquotes.docx differ diff --git a/test/docx/relative_indentation_blockquotes.native b/test/docx/relative_indentation_blockquotes.native new file mode 100644 index 000000000..37806f289 --- /dev/null +++ b/test/docx/relative_indentation_blockquotes.native @@ -0,0 +1,4 @@ +[Header 1 ("indentation-blockquotes",[],[]) [Str "Indentation",Space,Str "blockquotes"] +,Para [Str "Normal",Space,Str "list",Space,Str "paragraph"] +,Para [Str "List",Space,Str "paragraph",Space,Str "with",Space,Str "less",Space,Str "indent"] +,BlockQuote [Para [Str "List",Space,Str "paragraph",Space,Str "with",Space,Str "more",Space,Str "indent"]]] -- cgit v1.2.3 From 193f6bfebaa43d0d6749d10a4e7ca78a0d31361d Mon Sep 17 00:00:00 2001 From: Milan Bracke Date: Mon, 14 Jun 2021 15:00:36 +0200 Subject: Docx reader: fix handling of nested fields Fields delimited by fldChar elements can contain other fields. Before, the nested fields would be ignored, except for the end, which would be considered the end of the parent field. To fix this issue, fields needed to be considered containing ParParts instead of Runs, since a Run can't represent complex enough structures. This also impacted Hyperlinks since they can originate from a field. --- src/Text/Pandoc/Readers/Docx.hs | 18 +-- src/Text/Pandoc/Readers/Docx/Parse.hs | 247 +++++++++++++++++++--------------- test/Tests/Readers/Docx.hs | 4 + test/docx/nested_instrText.docx | Bin 0 -> 14112 bytes test/docx/nested_instrText.native | 5 + 5 files changed, 159 insertions(+), 115 deletions(-) create mode 100644 test/docx/nested_instrText.docx create mode 100644 test/docx/nested_instrText.native (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 66cd84291..462e3c679 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -246,8 +246,8 @@ runToText _ = "" parPartToText :: ParPart -> T.Text parPartToText (PlainRun run) = runToText run -parPartToText (InternalHyperLink _ runs) = T.concat $ map runToText runs -parPartToText (ExternalHyperLink _ runs) = T.concat $ map runToText runs +parPartToText (InternalHyperLink _ children) = T.concat $ map parPartToText children +parPartToText (ExternalHyperLink _ children) = T.concat $ map parPartToText children parPartToText _ = "" blacklistedCharStyles :: [CharStyleName] @@ -437,18 +437,18 @@ parPartToInlines' Chart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" parPartToInlines' Diagram = return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]" -parPartToInlines' (InternalHyperLink anchor runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' (InternalHyperLink anchor children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link ("#" <> anchor) "" ils -parPartToInlines' (ExternalHyperLink target runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' (ExternalHyperLink target children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link target "" ils parPartToInlines' (PlainOMath exps) = return $ math $ writeTeX exps -parPartToInlines' (Field info runs) = +parPartToInlines' (Field info children) = case info of - HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs - UnknownField -> smushInlines <$> mapM runToInlines runs + HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children + _ -> smushInlines <$> mapM parPartToInlines' children parPartToInlines' NullParPart = return mempty isAnchorSpan :: Inline -> Bool diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index e4d3ea6f8..a97d4b3d1 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -93,14 +93,13 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes deriving Show data ReaderState = ReaderState { stateWarnings :: [T.Text] - , stateFldCharState :: FldCharState + , stateFldCharState :: [FldCharState] } deriving Show data FldCharState = FldCharOpen | FldCharFieldInfo FieldInfo - | FldCharContent FieldInfo [Run] - | FldCharClosed + | FldCharContent FieldInfo [ParPart] deriving (Show) data DocxError = DocxError @@ -314,13 +313,13 @@ data ParPart = PlainRun Run | CommentStart CommentId Author (Maybe CommentDate) [BodyPart] | CommentEnd CommentId | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink URL [Run] + | InternalHyperLink Anchor [ParPart] + | ExternalHyperLink URL [ParPart] | Drawing FilePath T.Text T.Text B.ByteString Extent -- title, alt | Chart -- placeholder for now | Diagram -- placeholder for now | PlainOMath [Exp] - | Field FieldInfo [Run] + | Field FieldInfo [ParPart] | NullParPart -- when we need to return nothing, but -- not because of an error. deriving Show @@ -373,7 +372,7 @@ archiveToDocxWithWarnings archive = do , envDocXmlPath = docXmlPath } rState = ReaderState { stateWarnings = [] - , stateFldCharState = FldCharClosed + , stateFldCharState = [] } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of @@ -701,28 +700,31 @@ elemToBodyPart ns element elemToBodyPart ns element | isElem ns "w" "p" element = do parstyle <- elemToParagraphStyle ns element <$> asks envParStyles - parparts <- mapD (elemToParPart ns) (elChildren element) + parparts' <- mapD (elemToParPart ns) (elChildren element) + fldCharState <- gets stateFldCharState + modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState} -- Word uses list enumeration for numbered headings, so we only -- want to infer a list from the styles if it is NOT a heading. - case pHeading parstyle of - Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do - levelInfo <- lookupLevel numId lvl <$> asks envNumbering - return $ ListItem parstyle numId lvl levelInfo parparts - _ -> let - hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) - - hasSimpleTableField = fromMaybe False $ do - fldSimple <- findChildByName ns "w" "fldSimple" element - instr <- findAttrByName ns "w" "instr" fldSimple - pure ("Table" `elem` T.words instr) - - hasComplexTableField = fromMaybe False $ do - instrText <- findElementByName ns "w" "instrText" element - pure ("Table" `elem` T.words (strContent instrText)) - - in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) - then return $ TblCaption parstyle parparts - else return $ Paragraph parstyle parparts + let parparts = parparts' ++ (openFldCharsToParParts fldCharState) in + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + levelInfo <- lookupLevel numId lvl <$> asks envNumbering + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do @@ -754,6 +756,19 @@ lookupRelationship docLocation relid rels = where pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels +openFldCharsToParParts :: [FldCharState] -> [ParPart] +openFldCharsToParParts [] = [] +openFldCharsToParParts (FldCharContent info children : ancestors) = case openFldCharsToParParts ancestors of + Field parentInfo siblings : _ -> [Field parentInfo $ siblings ++ [Field info $ reverse children]] + _ -> [Field info $ reverse children] +openFldCharsToParParts (_ : ancestors) = openFldCharsToParParts ancestors + +emptyFldCharContents :: [FldCharState] -> [FldCharState] +emptyFldCharContents = map + (\x -> case x of + FldCharContent info _ -> FldCharContent info [] + _ -> x) + expandDrawingId :: T.Text -> D (FilePath, B.ByteString) expandDrawingId s = do location <- asks envLocation @@ -778,51 +793,6 @@ getTitleAndAlt ns element = in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" - , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem - = let (title, alt) = getTitleAndAlt ns drawingElem - a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrByName ns "r" "embed" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) - Nothing -> throwError WrongElem --- The two cases below are an attempt to deal with images in deprecated vml format. --- Todo: check out title and attr for deprecated format. -elemToParPart ns element - | isElem ns "w" "r" element - , Just _ <- findChildByName ns "w" "pict" element = - let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrByName ns "r" "id" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) - Nothing -> throwError WrongElem -elemToParPart ns element - | isElem ns "w" "r" element - , Just objectElem <- findChildByName ns "w" "object" element - , Just shapeElem <- findChildByName ns "v" "shape" objectElem - , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem - , Just drawingId <- findAttrByName ns "r" "id" imagedataElem - = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) --- Diagram -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" - , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem - = return Diagram --- Chart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" - , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem - = return Chart {- The next one is a bit complicated. fldChar fields work by first having a in a run, then a run with @@ -854,8 +824,13 @@ example (omissions and my comments in brackets): So we do this in a number of steps. If we encounter the fldchar begin tag, we start open a fldchar state variable (see state above). We add the instrtext to it as FieldInfo. Then we close that and start adding -the runs when we get to separate. Then when we get to end, we produce -the Field type with appropriate FieldInfo and Runs. +the children when we get to separate. Then when we get to end, we produce +the Field type with appropriate FieldInfo and ParParts. + +Since there can be nested fields, the fldchar state needs to be a stack, +so we can have multiple fldchars open at the same time. When a fldchar is +closed, we either add the resulting field to its parent or we return it if +there is no parent. -} elemToParPart ns element | isElem ns "w" "r" element @@ -863,78 +838,138 @@ elemToParPart ns element , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharClosed | fldCharType == "begin" -> do - modify $ \st -> st {stateFldCharState = FldCharOpen} + _ | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState} return NullParPart - FldCharFieldInfo info | fldCharType == "separate" -> do - modify $ \st -> st {stateFldCharState = FldCharContent info []} + FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors} return NullParPart - FldCharContent info runs | fldCharType == "end" -> do - modify $ \st -> st {stateFldCharState = FldCharClosed} - return $ Field info $ reverse runs + [FldCharContent info children] | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = []} + return $ Field info $ reverse children + FldCharContent info children : FldCharContent parentInfo siblings : ancestors | fldCharType == "end" -> + let parent = FldCharContent parentInfo $ (Field info (reverse children)) : siblings in do + modify $ \st -> st {stateFldCharState = parent : ancestors} + return NullParPart _ -> throwError WrongElem elemToParPart ns element | isElem ns "w" "r" element , Just instrText <- findChildByName ns "w" "instrText" element = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharOpen -> do + FldCharOpen : ancestors -> do info <- eitherToD $ parseFieldInfo $ strContent instrText - modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors} return NullParPart _ -> return NullParPart -elemToParPart ns element +{- +There is an open fldchar, so we calculate the element and add it to the +children. For this we need to first change the fldchar state to an empty +stack to avoid descendants of children simply being added to the state instead +of to their direct parent element. This would happen in the case of a +w:hyperlink element for example. +-} +elemToParPart ns element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info children : ancestors -> do + modify $ \st -> st {stateFldCharState = []} + parPart <- elemToParPart' ns element `catchError` \_ -> return NullParPart + modify $ \st -> st{stateFldCharState = FldCharContent info (parPart : children) : ancestors} + return NullParPart + _ -> elemToParPart' ns element + +elemToParPart' :: NameSpaces -> Element -> D ParPart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" + , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem + = let (title, alt) = getTitleAndAlt ns drawingElem + a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem + >>= findAttrByName ns "r" "embed" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) + Nothing -> throwError WrongElem +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. +elemToParPart' ns element + | isElem ns "w" "r" element + , Just _ <- findChildByName ns "w" "pict" element = + let drawing = findElement (elemName ns "v" "imagedata") element + >>= findAttrByName ns "r" "id" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) + Nothing -> throwError WrongElem +elemToParPart' ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) +-- Diagram +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" + , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem + = return Diagram +-- Chart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" + , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem + = return Chart +elemToParPart' ns element | isElem ns "w" "r" element = do run <- elemToRun ns element - -- we check to see if we have an open FldChar in state that we're - -- recording. - fldCharState <- gets stateFldCharState - case fldCharState of - FldCharContent info runs -> do - modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} - return NullParPart - _ -> return $ PlainRun run -elemToParPart ns element + return $ PlainRun run +elemToParPart' ns element | Just change <- getTrackedChange ns element = do runs <- mapD (elemToRun ns) (elChildren element) return $ ChangedRuns change runs -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "bookmarkStart" element , Just bmId <- findAttrByName ns "w" "id" element , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation - runs <- mapD (elemToRun ns) (elChildren element) + children <- mapD (elemToParPart ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> case findAttrByName ns "w" "anchor" element of - Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs - Nothing -> return $ ExternalHyperLink target runs - Nothing -> return $ ExternalHyperLink "" runs -elemToParPart ns element + Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) children + Nothing -> return $ ExternalHyperLink target children + Nothing -> return $ ExternalHyperLink "" children +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just anchor <- findAttrByName ns "w" "anchor" element = do - runs <- mapD (elemToRun ns) (elChildren element) - return $ InternalHyperLink anchor runs -elemToParPart ns element + children <- mapD (elemToParPart ns) (elChildren element) + return $ InternalHyperLink anchor children +elemToParPart' ns element | isElem ns "w" "commentRangeStart" element , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "commentRangeEnd" element , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId -elemToParPart ns element +elemToParPart' ns element | isElem ns "m" "oMath" element = fmap PlainOMath (eitherToD $ readOMML $ showElement element) -elemToParPart _ _ = throwError WrongElem +elemToParPart' _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 2f28af317..af6023836 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -147,6 +147,10 @@ tests = [ testGroup "document" "hyperlinks in tag" "docx/instrText_hyperlink.docx" "docx/instrText_hyperlink.native" + , testCompare + "nested fields with tag" + "docx/nested_instrText.docx" + "docx/nested_instrText.native" , testCompare "inline image" "docx/image.docx" diff --git a/test/docx/nested_instrText.docx b/test/docx/nested_instrText.docx new file mode 100644 index 000000000..532584193 Binary files /dev/null and b/test/docx/nested_instrText.docx differ diff --git a/test/docx/nested_instrText.native b/test/docx/nested_instrText.native new file mode 100644 index 000000000..730b041f5 --- /dev/null +++ b/test/docx/nested_instrText.native @@ -0,0 +1,5 @@ +[Para [Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340" + ,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "1/16/18",Space,Str "8:40:00",Space,Str "AM,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true","") + ,Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space + ,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","") + ,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"]] -- cgit v1.2.3 From 6acc82c5d2885c596c52e6c35bed8fe08f535066 Mon Sep 17 00:00:00 2001 From: Milan Bracke Date: Fri, 11 Jun 2021 09:26:09 +0200 Subject: Docx parser: implement PAGEREF fields These fields, often used in tables of contents, can be a hyperlink. --- src/Text/Pandoc/Readers/Docx.hs | 1 + src/Text/Pandoc/Readers/Docx/Fields.hs | 25 +++++++++++++++++++++++++ test/Tests/Readers/Docx.hs | 4 ++++ test/docx/0_level_headers.native | 6 +++--- test/docx/golden/nested_anchors_in_header.docx | Bin 10126 -> 10163 bytes test/docx/nested_anchors_in_header.native | 8 ++++---- test/docx/pageref.docx | Bin 0 -> 14431 bytes test/docx/pageref.native | 4 ++++ 8 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 test/docx/pageref.docx create mode 100644 test/docx/pageref.native (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 462e3c679..5c8f20c18 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -448,6 +448,7 @@ parPartToInlines' (PlainOMath exps) = parPartToInlines' (Field info children) = case info of HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children + PagerefField fieldAnchor True -> parPartToInlines' $ InternalHyperLink fieldAnchor children _ -> smushInlines <$> mapM parPartToInlines' children parPartToInlines' NullParPart = return mempty diff --git a/src/Text/Pandoc/Readers/Docx/Fields.hs b/src/Text/Pandoc/Readers/Docx/Fields.hs index 442bc3466..5f090b6be 100644 --- a/src/Text/Pandoc/Readers/Docx/Fields.hs +++ b/src/Text/Pandoc/Readers/Docx/Fields.hs @@ -21,8 +21,11 @@ import Text.Parsec import Text.Parsec.Text (Parser) type URL = T.Text +type Anchor = T.Text data FieldInfo = HyperlinkField URL + -- The boolean indicates whether the field is a hyperlink. + | PagerefField Anchor Bool | UnknownField deriving (Show) @@ -33,6 +36,8 @@ fieldInfo :: Parser FieldInfo fieldInfo = try (HyperlinkField <$> hyperlink) <|> + try ((uncurry PagerefField) <$> pageref) + <|> return UnknownField escapedQuote :: Parser T.Text @@ -72,3 +77,23 @@ hyperlink = do ("\\l", s) : _ -> farg <> "#" <> s _ -> farg return url + +-- See §17.16.5.45 +pagerefSwitch :: Parser (T.Text, T.Text) +pagerefSwitch = do + sw <- string "\\h" + spaces + farg <- fieldArgument + return (T.pack sw, farg) + +pageref :: Parser (Anchor, Bool) +pageref = do + many space + string "PAGEREF" + spaces + farg <- fieldArgument + switches <- spaces *> many pagerefSwitch + let isLink = case switches of + ("\\h", _) : _ -> True + _ -> False + return (farg, isLink) diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index af6023836..ea4094c82 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -151,6 +151,10 @@ tests = [ testGroup "document" "nested fields with tag" "docx/nested_instrText.docx" "docx/nested_instrText.native" + , testCompare + "pageref hyperlinks in tag" + "docx/pageref.docx" + "docx/pageref.native" , testCompare "inline image" "docx/image.docx" diff --git a/test/docx/0_level_headers.native b/test/docx/0_level_headers.native index ed589b029..7080063f9 100644 --- a/test/docx/0_level_headers.native +++ b/test/docx/0_level_headers.native @@ -39,9 +39,9 @@ []) ,Para [Str "CONTENTS"] ,Para [Strong [Str "Section",Space,Str "Page"]] -,Para [Str "FIGURES",Space,Str "iv"] -,Para [Str "TABLES",Space,Str "v"] -,Para [Str "SECTION",Space,Str "1",Space,Str "Introduction",Space,Str "2"] +,Para [Str "FIGURES",Space,Link ("",[],[]) [Str "iv"] ("#figures","")] +,Para [Str "TABLES",Space,Link ("",[],[]) [Str "v"] ("#tables","")] +,Para [Str "SECTION",Space,Str "1",Space,Str "Introduction",Space,Link ("",[],[]) [Str "2"] ("#introduction","")] ,Header 1 ("figures",["Heading-0"],[]) [Str "FIGURES"] ,Para [Strong [Str "Figure",Space,Str "Page"]] ,Para [Strong [Str "No",Space,Str "table",Space,Str "of",Space,Str "figures",Space,Str "entries",Space,Str "found."]] diff --git a/test/docx/golden/nested_anchors_in_header.docx b/test/docx/golden/nested_anchors_in_header.docx index 88dd21abd..9d89070d9 100644 Binary files a/test/docx/golden/nested_anchors_in_header.docx and b/test/docx/golden/nested_anchors_in_header.docx differ diff --git a/test/docx/nested_anchors_in_header.native b/test/docx/nested_anchors_in_header.native index 314b31663..9cc256d28 100644 --- a/test/docx/nested_anchors_in_header.native +++ b/test/docx/nested_anchors_in_header.native @@ -1,8 +1,8 @@ [Header 1 ("\1086\1075\1083\1072\1074\1083\1077\1085\1080\1077",["TOC-Heading"],[]) [Str "\1054\1075\1083\1072\1074\1083\1077\1085\1080\1077"] -,Para [Link ("",[],[]) [Str "Short",Space,Str "instructions",Space,Str "1"] ("#short-instructions","")] -,Para [Link ("",[],[]) [Str "Some",Space,Str "instructions",Space,Str "1"] ("#some-instructions","")] -,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "opening",Space,Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")] -,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "closing",Space,Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")] +,Para [Link ("",[],[]) [Str "Short",Space,Str "instructions",Space,Link ("",[],[]) [Str "1"] ("#short-instructions","")] ("#short-instructions","")] +,Para [Link ("",[],[]) [Str "Some",Space,Str "instructions",Space,Link ("",[],[]) [Str "1"] ("#some-instructions","")] ("#some-instructions","")] +,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "opening",Space,Link ("",[],[]) [Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")] +,Para [Link ("",[],[]) [Str "Remote",Space,Str "folder",Space,Str "or",Space,Str "longlonglonglonglong",Space,Str "file",Space,Str "with",Space,Str "manymanymanymany",Space,Str "letters",Space,Str "inside",Space,Str "closing",Space,Link ("",[],[]) [Str "2"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-closing","")] ,Header 1 ("short-instructions",[],[]) [Str "Short",Space,Str "instructions"] ,Para [Link ("",[],[]) [Str "Open",Space,Str "remote",Space,Str "folder"] ("#remote-folder-or-longlonglonglonglong-file-with-manymanymanymany-letters-inside-opening","")] ,Para [Str "Do",Space,Str "staff"] diff --git a/test/docx/pageref.docx b/test/docx/pageref.docx new file mode 100644 index 000000000..0a9159cab Binary files /dev/null and b/test/docx/pageref.docx differ diff --git a/test/docx/pageref.native b/test/docx/pageref.native new file mode 100644 index 000000000..6c683de67 --- /dev/null +++ b/test/docx/pageref.native @@ -0,0 +1,4 @@ +[Para [Str "Title",Space,Link ("",[],[]) [Str "2"] ("#title","")] +,Para [Str "Title2",Space,Link ("",[],[]) [Str "2"] ("#title2","")] +,Header 1 ("title", [],[]) [Str "Title"] +,Header 1 ("title2",[],[]) [Str "Title2"]] -- cgit v1.2.3 From 465c28d28e1017040a41653edb6248056f178d3b Mon Sep 17 00:00:00 2001 From: Milan Bracke Date: Thu, 24 Jun 2021 09:27:28 +0200 Subject: Docx reader: fix handling of empty fields Some fields only have an instrText and no content, Pandoc didn't understand these, causing other fields to be misunderstood because it seemed like a field was still open when it wasn't. --- src/Text/Pandoc/Readers/Docx/Parse.hs | 4 ++++ test/Tests/Readers/Docx.hs | 4 ++++ test/docx/empty_field.docx | Bin 0 -> 14312 bytes test/docx/empty_field.native | 7 +++++++ 4 files changed, 15 insertions(+) create mode 100644 test/docx/empty_field.docx create mode 100644 test/docx/empty_field.native (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index a97d4b3d1..0021741ed 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -844,6 +844,10 @@ elemToParPart ns element FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors} return NullParPart + -- Some fields have no content, since Pandoc doesn't understand any of those fields, we can just close it. + FldCharFieldInfo _ : ancestors | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = ancestors} + return NullParPart [FldCharContent info children] | fldCharType == "end" -> do modify $ \st -> st {stateFldCharState = []} return $ Field info $ reverse children diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index ea4094c82..be5b89b88 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -151,6 +151,10 @@ tests = [ testGroup "document" "nested fields with tag" "docx/nested_instrText.docx" "docx/nested_instrText.native" + , testCompare + "empty fields with tag" + "docx/empty_field.docx" + "docx/empty_field.native" , testCompare "pageref hyperlinks in tag" "docx/pageref.docx" diff --git a/test/docx/empty_field.docx b/test/docx/empty_field.docx new file mode 100644 index 000000000..fb3830b0d Binary files /dev/null and b/test/docx/empty_field.docx differ diff --git a/test/docx/empty_field.native b/test/docx/empty_field.native new file mode 100644 index 000000000..18d892454 --- /dev/null +++ b/test/docx/empty_field.native @@ -0,0 +1,7 @@ +[Para + [Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340" + ,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true",""),Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space + ,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","") + ,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"] +,Para [Str "Index:"] +,Para [Str "French,",Space,Str "1"]] -- cgit v1.2.3 From c712d13b67a92c887d5ef185064aecf0972d4496 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 22 Oct 2021 22:10:25 -0700 Subject: Org reader: allow an initial :PROPERTIES: drawer to add to metadata. Closes #7520. --- src/Text/Pandoc/Readers/Org/DocumentTree.hs | 12 ++++++++++-- test/Tests/Readers/Org/Meta.hs | 2 +- test/command/7520.md | 22 ++++++++++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 test/command/7520.md (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Org/DocumentTree.hs b/src/Text/Pandoc/Readers/Org/DocumentTree.hs index 2dcbecb1d..1c4f253cc 100644 --- a/src/Text/Pandoc/Readers/Org/DocumentTree.hs +++ b/src/Text/Pandoc/Readers/Org/DocumentTree.hs @@ -41,6 +41,7 @@ documentTree :: PandocMonad m -> OrgParser m (F Inlines) -> OrgParser m (F Headline) documentTree blocks inline = do + properties <- option mempty propertiesDrawer initialBlocks <- blocks headlines <- sequence <$> manyTill (headline blocks inline 1) eof title <- fmap docTitle . orgStateMeta <$> getState @@ -54,7 +55,7 @@ documentTree blocks inline = do , headlineText = B.fromList title' , headlineTags = mempty , headlinePlanning = emptyPlanning - , headlineProperties = mempty + , headlineProperties = properties , headlineContents = initialBlocks' , headlineChildren = headlines' } @@ -163,8 +164,15 @@ unprunedHeadlineToBlocks hdln st = in if not usingSelectedTags || any (`Set.member` orgStateSelectTags st) (headlineTags rootNode') then do headlineBlocks <- headlineToBlocks rootNode' + -- add metadata from root node :PROPERTIES: + updateState $ \s -> + s{ orgStateMeta = foldr + (\(PropertyKey k, PropertyValue v) m -> + B.setMeta k v <$> m) + (orgStateMeta s) + (headlineProperties rootNode') } -- ignore first headline, it's the document's title - return . drop 1 . B.toList $ headlineBlocks + return $ drop 1 $ B.toList headlineBlocks else do headlineBlocks <- mconcat <$> mapM headlineToBlocks (headlineChildren rootNode') return . B.toList $ headlineBlocks diff --git a/test/Tests/Readers/Org/Meta.hs b/test/Tests/Readers/Org/Meta.hs index 6363d84b0..41a41cb00 100644 --- a/test/Tests/Readers/Org/Meta.hs +++ b/test/Tests/Readers/Org/Meta.hs @@ -238,7 +238,7 @@ tests = , " :setting: foo" , " :END:" ] =?> - (mempty::Blocks) + (setMeta "setting" ("foo" :: T.Text) (doc mempty)) , "Logbook drawer" =: T.unlines [ " :LogBook:" diff --git a/test/command/7520.md b/test/command/7520.md new file mode 100644 index 000000000..0300a9e45 --- /dev/null +++ b/test/command/7520.md @@ -0,0 +1,22 @@ +``` +% pandoc -f org -t native -s +:PROPERTIES: +:ID: d5b18943-98a3-4b2a-a545-41d17bf50f3e +:END: +#+title: Common Ground +^D +Pandoc + Meta + { unMeta = + fromList + [ ( "id" + , MetaString "d5b18943-98a3-4b2a-a545-41d17bf50f3e" + ) + , ( "title" + , MetaInlines [ Str "Common" , Space , Str "Ground" ] + ) + ] + } + [] + +``` -- cgit v1.2.3 From 005dc7ce56a4a165fa9af239cc28a2589f7b169d Mon Sep 17 00:00:00 2001 From: willj-dev <94586033+willj-dev@users.noreply.github.com> Date: Thu, 18 Nov 2021 17:33:57 -0800 Subject: RST reader: handle class attribute for for custom roles (#7700) Previously the class attribute was ignored, and the name of the role used as the class. Closes #7699. --- src/Text/Pandoc/Readers/RST.hs | 24 ++++++++++++++++-------- test/Tests/Readers/RST.hs | 9 +++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index 8ee017342..88471eb0a 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -919,14 +919,22 @@ addNewRole roleText fields = do (baseRole, baseFmt, baseAttr) = getBaseRole (parentRole, Nothing, nullAttr) customRoles fmt = if parentRole == "raw" then lookup "format" fields else baseFmt - annotate :: [Text] -> [Text] - annotate = maybe id (:) $ - if baseRole == "code" - then lookup "language" fields - else Nothing - attr = let (ident, classes, keyValues) = baseAttr - -- nub in case role name & language class are the same - in (ident, nub . (role :) . annotate $ classes, keyValues) + + updateClasses :: [Text] -> [Text] + updateClasses oldClasses = let + + codeLanguageClass = if baseRole == "code" + then maybeToList (lookup "language" fields) + else [] + + -- if no ":class:" field is given, the default is the role name + classFieldClasses = maybe [role] T.words (lookup "class" fields) + + -- nub in case role name & language class are the same + in nub (classFieldClasses ++ codeLanguageClass ++ oldClasses) + + attr = let (ident, baseClasses, keyValues) = baseAttr + in (ident, updateClasses baseClasses, keyValues) -- warn about syntax we ignore forM_ fields $ \(key, _) -> case key of diff --git a/test/Tests/Readers/RST.hs b/test/Tests/Readers/RST.hs index a12b59fc2..e9ab8cc11 100644 --- a/test/Tests/Readers/RST.hs +++ b/test/Tests/Readers/RST.hs @@ -179,6 +179,15 @@ tests = [ "line block with blank line" =: , "custom code role with language field" =: ".. role:: lhs(code)\n :language: haskell\n\n:lhs:`a`" =?> para (codeWith ("", ["lhs", "haskell"], []) "a") + , "custom role with class field" + =: ".. role:: classy\n :class: myclass\n\n:classy:`a`" + =?> para (spanWith ("", ["myclass"], []) "a") + , "custom role with class field containing multiple whitespace-separated classes" + =: ".. role:: classy\n :class: myclass1 myclass2\n myclass3\n\n:classy:`a`" + =?> para (spanWith ("", ["myclass1", "myclass2", "myclass3"], []) "a") + , "custom role with inherited class field" + =: ".. role:: classy\n :class: myclass1\n.. role:: classier(classy)\n :class: myclass2\n\n:classier:`a`" + =?> para (spanWith ("", ["myclass2", "myclass1"], []) "a") , "custom role with unspecified parent role" =: ".. role:: classy\n\n:classy:`text`" =?> para (spanWith ("", ["classy"], []) "text") -- cgit v1.2.3 From 5817e864918e5d03b6402afac0ff8c748a2ac2f6 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 14 Dec 2021 09:20:09 -0800 Subject: Org reader: remove support for "Berkeley style" citations. See #7329. --- src/Text/Pandoc/Readers/Org/Inlines.hs | 187 +++++++----------------------- test/Tests/Readers/Org/Inline/Citation.hs | 47 -------- 2 files changed, 42 insertions(+), 192 deletions(-) (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index 6862dd71e..2366aa290 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -32,10 +32,9 @@ import Text.TeXMath (DisplayType (..), readTeX, writePandoc) import Text.Pandoc.Sources (ToSources(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap -import Control.Monad (guard, mplus, mzero, unless, void, when) +import Control.Monad (guard, mplus, mzero, unless, when) import Control.Monad.Trans (lift) import Data.Char (isAlphaNum, isSpace) -import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T @@ -148,32 +147,57 @@ endline = try $ do -- Citations -- --- The state of citations is a bit confusing due to the lack of an official --- syntax and multiple syntaxes coexisting. The pandocOrgCite syntax was the --- first to be implemented here and is almost identical to Markdown's citation --- syntax. The org-ref package is in wide use to handle citations, but the --- syntax is a bit limiting and not quite as simple to write. The --- semi-official Org-mode citation syntax is based on John MacFarlane's Pandoc --- sytax and Org-oriented enhancements contributed by Richard Lawrence and --- others. It's dubbed Berkeley syntax due the place of activity of its main --- contributors. All this should be consolidated once an official Org-mode --- citation syntax has emerged. +-- We first try to parse official org-cite citations, then fall +-- back to org-ref citations (which are still in wide use). cite :: PandocMonad m => OrgParser m (F Inlines) -cite = try $ berkeleyCite <|> do +cite = try $ do guardEnabled Ext_citations (cs, raw) <- withRaw $ choice - [ pandocOrgCite + [ orgCite , orgRefCite - , berkeleyTextualCite ] return $ flip B.cite (B.text raw) <$> cs --- | A citation in Pandoc Org-mode style (@[prefix \@citekey suffix]@). -pandocOrgCite :: PandocMonad m => OrgParser m (F [Citation]) -pandocOrgCite = try $ +-- | A citation in org-cite style +orgCite :: PandocMonad m => OrgParser m (F [Citation]) +orgCite = try $ char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']' +citeList :: PandocMonad m => OrgParser m (F [Citation]) +citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) + +citation :: PandocMonad m => OrgParser m (F Citation) +citation = try $ do + pref <- prefix + (suppress_author, key) <- citeKey False + suff <- suffix + return $ do + x <- pref + y <- suff + return Citation + { citationId = key + , citationPrefix = B.toList x + , citationSuffix = B.toList y + , citationMode = if suppress_author + then SuppressAuthor + else NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + where + prefix = trimInlinesF . mconcat <$> + manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) + suffix = try $ do + hasSpace <- option False (notFollowedBy nonspaceChar >> return True) + skipSpaces + rest <- trimInlinesF . mconcat <$> + many (notFollowedBy (oneOf ";]") *> inline) + return $ if hasSpace + then (B.space <>) <$> rest + else rest + + orgRefCite :: PandocMonad m => OrgParser m (F [Citation]) orgRefCite = try $ choice [ normalOrgRefCite @@ -201,100 +225,6 @@ normalOrgRefCite = try $ do , citationHash = 0 } --- | Read an Berkeley-style Org-mode citation. Berkeley citation style was --- develop and adjusted to Org-mode style by John MacFarlane and Richard --- Lawrence, respectively, both philosophers at UC Berkeley. -berkeleyCite :: PandocMonad m => OrgParser m (F Inlines) -berkeleyCite = try $ do - bcl <- berkeleyCitationList - return $ do - parens <- berkeleyCiteParens <$> bcl - prefix <- berkeleyCiteCommonPrefix <$> bcl - suffix <- berkeleyCiteCommonSuffix <$> bcl - citationList <- berkeleyCiteCitations <$> bcl - return $ - if parens - then toCite - . maybe id (alterFirst . prependPrefix) prefix - . maybe id (alterLast . appendSuffix) suffix - $ citationList - else maybe mempty (<> " ") prefix - <> toListOfCites (map toInTextMode citationList) - <> maybe mempty (", " <>) suffix - where - toCite :: [Citation] -> Inlines - toCite cs = B.cite cs mempty - - toListOfCites :: [Citation] -> Inlines - toListOfCites = mconcat . intersperse ", " . map (\c -> B.cite [c] mempty) - - toInTextMode :: Citation -> Citation - toInTextMode c = c { citationMode = AuthorInText } - - alterFirst, alterLast :: (a -> a) -> [a] -> [a] - alterFirst _ [] = [] - alterFirst f (c:cs) = f c : cs - alterLast f = reverse . alterFirst f . reverse - - prependPrefix, appendSuffix :: Inlines -> Citation -> Citation - prependPrefix pre c = c { citationPrefix = B.toList pre <> citationPrefix c } - appendSuffix suf c = c { citationSuffix = citationSuffix c <> B.toList suf } - -data BerkeleyCitationList = BerkeleyCitationList - { berkeleyCiteParens :: Bool - , berkeleyCiteCommonPrefix :: Maybe Inlines - , berkeleyCiteCommonSuffix :: Maybe Inlines - , berkeleyCiteCitations :: [Citation] - } -berkeleyCitationList :: PandocMonad m => OrgParser m (F BerkeleyCitationList) -berkeleyCitationList = try $ do - char '[' - parens <- choice [ False <$ berkeleyBareTag, True <$ berkeleyParensTag ] - char ':' - skipSpaces - commonPrefix <- optionMaybe (try $ citationListPart <* char ';') - citations <- citeList - commonSuffix <- optionMaybe (try citationListPart) - char ']' - return (BerkeleyCitationList parens - <$> sequence commonPrefix - <*> sequence commonSuffix - <*> citations) - where - citationListPart :: PandocMonad m => OrgParser m (F Inlines) - citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do - notFollowedBy' $ citeKey False - notFollowedBy (oneOf ";]") - inline - -berkeleyBareTag :: PandocMonad m => OrgParser m () -berkeleyBareTag = try $ void berkeleyBareTag' - -berkeleyParensTag :: PandocMonad m => OrgParser m () -berkeleyParensTag = try . void $ enclosedByPair1 '(' ')' berkeleyBareTag' - -berkeleyBareTag' :: PandocMonad m => OrgParser m () -berkeleyBareTag' = try $ void (string "cite") - -berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation]) -berkeleyTextualCite = try $ do - (suppressAuthor, key) <- citeKey False - returnF . return $ Citation - { citationId = key - , citationPrefix = mempty - , citationSuffix = mempty - , citationMode = if suppressAuthor then SuppressAuthor else AuthorInText - , citationNoteNum = 0 - , citationHash = 0 - } - --- The following is what a Berkeley-style bracketed textual citation parser --- would look like. However, as these citations are a subset of Pandoc's Org --- citation style, this isn't used. --- berkeleyBracketedTextualCite :: PandocMonad m => OrgParser m (F [Citation]) --- berkeleyBracketedTextualCite = try . (fmap head) $ --- enclosedByPair1 '[' ']' berkeleyTextualCite - -- | Read a link-like org-ref style citation. The citation includes pre and -- post text. However, multiple citations are not possible due to limitations -- in the syntax. @@ -345,39 +275,6 @@ orgRefCiteMode = , ("citeyear", SuppressAuthor) ] -citeList :: PandocMonad m => OrgParser m (F [Citation]) -citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) - -citation :: PandocMonad m => OrgParser m (F Citation) -citation = try $ do - pref <- prefix - (suppress_author, key) <- citeKey False - suff <- suffix - return $ do - x <- pref - y <- suff - return Citation - { citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - where - prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) - suffix = try $ do - hasSpace <- option False (notFollowedBy nonspaceChar >> return True) - skipSpaces - rest <- trimInlinesF . mconcat <$> - many (notFollowedBy (oneOf ";]") *> inline) - return $ if hasSpace - then (B.space <>) <$> rest - else rest - footnote :: PandocMonad m => OrgParser m (F Inlines) footnote = try $ do note <- inlineNote <|> referencedNote diff --git a/test/Tests/Readers/Org/Inline/Citation.hs b/test/Tests/Readers/Org/Inline/Citation.hs index a11804983..7eabd9aae 100644 --- a/test/Tests/Readers/Org/Inline/Citation.hs +++ b/test/Tests/Readers/Org/Inline/Citation.hs @@ -169,53 +169,6 @@ tests = in (para $ cite [citation] "[[citep:Dominik201408][See page 20::, for example]]") ] - , testGroup "Berkeley-style citations" $ - let pandocCite = Citation - { citationId = "Pandoc" - , citationPrefix = mempty - , citationSuffix = mempty - , citationMode = NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - pandocInText = pandocCite { citationMode = AuthorInText } - dominikCite = Citation - { citationId = "Dominik201408" - , citationPrefix = mempty - , citationSuffix = mempty - , citationMode = NormalCitation - , citationNoteNum = 0 - , citationHash = 0 - } - dominikInText = dominikCite { citationMode = AuthorInText } - in - [ "Berkeley-style in-text citation" =: - "See @Dominik201408." =?> - para ("See " - <> cite [dominikInText] "@Dominik201408" - <> ".") - - , "Berkeley-style parenthetical citation list" =: - "[(cite): see; @Dominik201408;also @Pandoc; and others]" =?> - let pandocCite' = pandocCite { - citationPrefix = toList "also" - , citationSuffix = toList "and others" - } - dominikCite' = dominikCite { - citationPrefix = toList "see" - } - in (para $ cite [dominikCite', pandocCite'] "") - - , "Berkeley-style plain citation list" =: - "[cite: See; @Dominik201408; and @Pandoc; and others]" =?> - let pandocCite' = pandocInText { citationPrefix = toList "and" } - in (para $ "See " - <> cite [dominikInText] "" - <> "," <> space - <> cite [pandocCite'] "" - <> "," <> space <> "and others") - ] - , "LaTeX citation" =: "\\cite{Coffee}" =?> let citation = Citation -- cgit v1.2.3 From 394fa9d0727a30f540d9c36ccfa68fc942cad587 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 14 Dec 2021 10:40:24 -0800 Subject: Org reader: parse official org-cite citations. We also support the older org-ref style as a fallback. We no longer support the "markdown-style" citations. See #7329. --- src/Text/Pandoc/Readers/Org/Inlines.hs | 199 ++++++++++++++++++++++++------ test/Tests/Readers/Org/Inline/Citation.hs | 40 +++++- test/command/7329.md | 23 ++++ 3 files changed, 218 insertions(+), 44 deletions(-) (limited to 'test/Tests/Readers') diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index 2366aa290..617f98a10 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -31,8 +31,8 @@ import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline) import Text.TeXMath (DisplayType (..), readTeX, writePandoc) import Text.Pandoc.Sources (ToSources(..)) import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap - -import Control.Monad (guard, mplus, mzero, unless, when) +import Safe (lastMay) +import Control.Monad (guard, mplus, mzero, unless, when, void) import Control.Monad.Trans (lift) import Data.Char (isAlphaNum, isSpace) import qualified Data.Map as M @@ -150,53 +150,174 @@ endline = try $ do -- We first try to parse official org-cite citations, then fall -- back to org-ref citations (which are still in wide use). -cite :: PandocMonad m => OrgParser m (F Inlines) -cite = try $ do - guardEnabled Ext_citations - (cs, raw) <- withRaw $ choice - [ orgCite - , orgRefCite - ] - return $ flip B.cite (B.text raw) <$> cs - -- | A citation in org-cite style orgCite :: PandocMonad m => OrgParser m (F [Citation]) -orgCite = try $ - char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']' - -citeList :: PandocMonad m => OrgParser m (F [Citation]) -citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces) - -citation :: PandocMonad m => OrgParser m (F Citation) -citation = try $ do - pref <- prefix - (suppress_author, key) <- citeKey False - suff <- suffix +orgCite = try $ do + string "[cite" + (sty, _variants) <- citeStyle + char ':' + spnl + globalPref <- option mempty (try (citePrefix <* char ';')) + items <- citeItems + globalSuff <- option mempty (try (char ';' *> citeSuffix)) + spnl + char ']' + return $ adjustCiteStyle sty . + addPrefixToFirstItem globalPref . + addSuffixToLastItem globalSuff $ items + +adjustCiteStyle :: CiteStyle -> (F [Citation]) -> (F [Citation]) +adjustCiteStyle sty cs = do + cs' <- cs + case cs' of + [] -> return [] + (d:ds) -- TODO needs refinement + -> case sty of + TextStyle -> return $ d{ citationMode = AuthorInText + , citationSuffix = dropWhile (== Space) + (citationSuffix d)} : ds + NoAuthorStyle -> return $ d{ citationMode = SuppressAuthor } : ds + _ -> return (d:ds) + +addPrefixToFirstItem :: (F Inlines) -> (F [Citation]) -> (F [Citation]) +addPrefixToFirstItem aff cs = do + cs' <- cs + aff' <- aff + case cs' of + [] -> return [] + (d:ds) -> return (d{ citationPrefix = + B.toList aff' <> citationPrefix d }:ds) + +addSuffixToLastItem :: (F Inlines) -> (F [Citation]) -> (F [Citation]) +addSuffixToLastItem aff cs = do + cs' <- cs + aff' <- aff + case lastMay cs' of + Nothing -> return cs' + Just d -> + return (init cs' ++ [d{ citationSuffix = + citationSuffix d <> B.toList aff' }]) + +citeItems :: PandocMonad m => OrgParser m (F [Citation]) +citeItems = sequence <$> citeItem `sepBy1` (char ';') + +citeItem :: PandocMonad m => OrgParser m (F Citation) +citeItem = do + pref <- citePrefix + itemKey <- orgCiteKey + suff <- citeSuffix return $ do - x <- pref - y <- suff + pre' <- pref + suf' <- suff return Citation - { citationId = key - , citationPrefix = B.toList x - , citationSuffix = B.toList y - , citationMode = if suppress_author - then SuppressAuthor - else NormalCitation + { citationId = itemKey + , citationPrefix = B.toList pre' + , citationSuffix = B.toList suf' + , citationMode = NormalCitation , citationNoteNum = 0 , citationHash = 0 } + +orgCiteKey :: PandocMonad m => OrgParser m Text +orgCiteKey = do + char '@' + T.pack <$> many1 (satisfy orgCiteKeyChar) + +orgCiteKeyChar :: Char -> Bool +orgCiteKeyChar c = + isAlphaNum c || c `elem` ['.',':','?','!','`','\'','/','*','@','+','|', + '(',')','{','}','<','>','&','_','^','$','#', + '%','~','-'] + +rawAffix :: PandocMonad m => Bool -> OrgParser m Text +rawAffix isPrefix = snd <$> withRaw + (many + (affixChar + <|> + try (void (char '[' >> rawAffix isPrefix >> char ']')))) where - prefix = trimInlinesF . mconcat <$> - manyTill inline (char ']' <|> (']' <$ lookAhead (citeKey False))) - suffix = try $ do - hasSpace <- option False (notFollowedBy nonspaceChar >> return True) - skipSpaces - rest <- trimInlinesF . mconcat <$> - many (notFollowedBy (oneOf ";]") *> inline) + affixChar = void $ satisfy $ \c -> + not (c == '^' || c == ';' || c == '[' || c == ']') && + (not isPrefix || c /= '@') + +citePrefix :: PandocMonad m => OrgParser m (F Inlines) +citePrefix = + rawAffix True >>= parseFromString (trimInlinesF . mconcat <$> many inline) + +citeSuffix :: PandocMonad m => OrgParser m (F Inlines) +citeSuffix = + rawAffix False >>= parseFromString parseSuffix + where + parseSuffix = do + hasSpace <- option False + (True <$ try (spaceChar >> skipSpaces >> lookAhead nonspaceChar)) + ils <- trimInlinesF . mconcat <$> many inline return $ if hasSpace - then (B.space <>) <$> rest - else rest + then (B.space <>) <$> ils + else ils + +citeStyle :: PandocMonad m => OrgParser m (CiteStyle, [CiteVariant]) +citeStyle = option (DefStyle, []) $ do + sty <- option DefStyle $ try $ char '/' *> orgCiteStyle + variants <- option [] $ try $ char '/' *> orgCiteVariants + return (sty, variants) + +orgCiteStyle :: PandocMonad m => OrgParser m CiteStyle +orgCiteStyle = choice $ map try + [ NoAuthorStyle <$ string "noauthor" + , NoAuthorStyle <$ string "na" + , LocatorsStyle <$ string "locators" + , LocatorsStyle <$ char 'l' + , NociteStyle <$ string "nocite" + , NociteStyle <$ char 'n' + , TextStyle <$ string "text" + , TextStyle <$ char 't' + ] + +orgCiteVariants :: PandocMonad m => OrgParser m [CiteVariant] +orgCiteVariants = + (fullnameVariant `sepBy1` (char '-')) <|> (many1 onecharVariant) + where + fullnameVariant = choice $ map try + [ Bare <$ string "bare" + , Caps <$ string "caps" + , Full <$ string "full" + ] + onecharVariant = choice + [ Bare <$ char 'b' + , Caps <$ char 'c' + , Full <$ char 'f' + ] + +data CiteStyle = + NoAuthorStyle + | LocatorsStyle + | NociteStyle + | TextStyle + | DefStyle + deriving Show + +data CiteVariant = + Caps + | Bare + | Full + deriving Show + + +spnl :: PandocMonad m => OrgParser m () +spnl = + skipSpaces *> optional (newline *> notFollowedBy blankline *> skipSpaces) + +cite :: PandocMonad m => OrgParser m (F Inlines) +cite = do + guardEnabled Ext_citations + (cs, raw) <- withRaw $ try $ choice + [ orgCite + , orgRefCite + ] + return $ flip B.cite (B.text raw) <$> cs +-- org-ref orgRefCite :: PandocMonad m => OrgParser m (F [Citation]) orgRefCite = try $ choice diff --git a/test/Tests/Readers/Org/Inline/Citation.hs b/test/Tests/Readers/Org/Inline/Citation.hs index 7eabd9aae..2d0d460a2 100644 --- a/test/Tests/Readers/Org/Inline/Citation.hs +++ b/test/Tests/Readers/Org/Inline/Citation.hs @@ -19,9 +19,9 @@ import Text.Pandoc.Builder tests :: [TestTree] tests = - [ testGroup "Markdown-style citations" + [ testGroup "Org-cite citations" [ "Citation" =: - "[@nonexistent]" =?> + "[cite:@nonexistent]" =?> let citation = Citation { citationId = "nonexistent" , citationPrefix = [] @@ -29,10 +29,10 @@ tests = , citationMode = NormalCitation , citationNoteNum = 0 , citationHash = 0} - in (para $ cite [citation] "[@nonexistent]") + in (para $ cite [citation] "[cite:@nonexistent]") , "Citation containing text" =: - "[see @item1 p. 34-35]" =?> + "[cite:see @item1 p. 34-35]" =?> let citation = Citation { citationId = "item1" , citationPrefix = [Str "see"] @@ -40,7 +40,37 @@ tests = , citationMode = NormalCitation , citationNoteNum = 0 , citationHash = 0} - in (para $ cite [citation] "[see @item1 p. 34-35]") + in (para $ cite [citation] "[cite:see @item1 p. 34-35]") + + , "Author-in-text citation with locator and suffix" =: + "[cite/t:see @item1 p. 34-35 and *passim*; @item2]" =?> + let citations = + [ Citation + { citationId = "item1" + , citationPrefix = [ Str "see" ] + , citationSuffix = + [ Str "p." + , Space + , Str "34-35" + , Space + , Str "and" + , Space + , Strong [ Str "passim" ] + ] + , citationMode = AuthorInText + , citationNoteNum = 0 + , citationHash = 0 + } + , Citation + { citationId = "item2" + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationNoteNum = 0 + , citationHash = 0 + } + ] + in (para $ cite citations "[cite/t:see @item1 p. 34-35 and *passim*; @item2]") ] , testGroup "org-ref citations" diff --git a/test/command/7329.md b/test/command/7329.md index 565241db8..9ed9c52ff 100644 --- a/test/command/7329.md +++ b/test/command/7329.md @@ -42,3 +42,26 @@ <> Doe, John. 2005. /First Book/. Cambridge: Cambridge University Press. ``` + +``` +% pandoc -f org -t markdown +- [cite/t:@item1] +- [cite/t:@item1 p. 12] +- [cite/t:@item1 p.12; see also @item2] +- [cite:@item1] +- [cite/na:@item1] +- [cite:see @item1 p. 12] +- [cite:see @item1 p. 12 and /passim/] +- [cite:@item1; @item2] +- [cite:see @item1; @item2] +^D +- @item1 +- @item1 [p. 12] +- @item1 [p.12; see also @item2] +- [@item1] +- [-@item1] +- [see @item1 p. 12] +- [see @item1 p. 12 and *passim*] +- [@item1; @item2] +- [see @item1; @item2] +``` -- cgit v1.2.3