From 3c4a58bad03ef56ae9c82b7e7a6ae027514e2bd6 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 10 Feb 2021 23:01:08 -0800
Subject: T.P.Class: Add getTimestamp [API change].

This attempts to read the SOURCE_DATE_EPOCH environment variable
and parse a UTC time from it (treating it as a unix date stamp,
see https://reproducible-builds.org/specs/source-date-epoch/).
If the variable is not set or can't be parsed as a unix date
stamp, then the function returns the current date.
---
 src/Text/Pandoc/Class/PandocMonad.hs | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 374da161b..86c8de79e 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -51,6 +51,7 @@ module Text.Pandoc.Class.PandocMonad
   , setTranslations
   , translateTerm
   , makeCanonical
+  , getTimestamp
   ) where
 
 import Codec.Archive.Zip
@@ -59,7 +60,8 @@ import Control.Monad.Except (MonadError (catchError, throwError),
 import Data.Digest.Pure.SHA (sha1, showDigest)
 import Data.Maybe (fromMaybe)
 import Data.Time (UTCTime)
-import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds)
+import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds,
+                             posixSecondsToUTCTime)
 import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime)
 import Network.URI ( escapeURIString, nonStrictRelativeTo,
                      unEscapeString, parseURIReference, isAllowedInURI,
@@ -74,7 +76,7 @@ import Text.Pandoc.Error
 import Text.Pandoc.Logging
 import Text.Pandoc.MIME (MimeType, getMimeType, extensionFromMimeType)
 import Text.Pandoc.MediaBag (MediaBag, lookupMedia)
-import Text.Pandoc.Shared (uriPathToPath)
+import Text.Pandoc.Shared (uriPathToPath, safeRead)
 import Text.Pandoc.Translations (Term(..), Translations, lookupTerm,
                                  readTranslations)
 import Text.Pandoc.Walk (walkM)
@@ -175,6 +177,21 @@ report msg = do
   when (level <= verbosity) $ logOutput msg
   modifyCommonState $ \st -> st{ stLog = msg : stLog st }
 
+-- | Get the time from the @SOURCE_DATE_EPOCH@
+-- environment variable. The variable should contain a
+-- unix time stamp, the number of seconds since midnight Jan 01
+-- 1970 UTC.  If the variable is not set or cannot be
+-- parsed as a unix time stamp, the current time is returned.
+-- This function is designed to make possible reproducible
+-- builds in formats that include a creation timestamp.
+getTimestamp :: PandocMonad m => m UTCTime
+getTimestamp = do
+  mbSourceDateEpoch <- lookupEnv "SOURCE_DATE_EPOCH"
+  case mbSourceDateEpoch >>= safeRead of
+    Just (epoch :: Integer) ->
+      return $ posixSecondsToUTCTime $ fromIntegral epoch
+    Nothing -> getCurrentTime
+
 -- | Determine whether tracing is enabled.  This affects
 -- the behavior of 'trace'.  If tracing is not enabled,
 -- 'trace' does nothing.
-- 
cgit v1.2.3


From 80fde18fb1d983b938476ed5b3771ed5d6158d44 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 22 Feb 2021 11:30:07 -0800
Subject: Text.Pandoc.UTF8: change IO functions to return Text, not String.

[API change] This affects `readFile`, `getContents`, `writeFileWith`,
`writeFile`, `putStrWith`, `putStr`, `putStrLnWith`, `putStrLn`.
`hPutStrWith`, `hPutStr`, `hPutStrLnWith`, `hPutStrLn`, `hGetContents`.

This avoids the need to uselessly create a linked list of characters
when emiting output.
---
 src/Text/Pandoc/App.hs                    | 10 ++++----
 src/Text/Pandoc/App/CommandLineOptions.hs | 38 +++++++++++++++-------------
 src/Text/Pandoc/App/OutputSettings.hs     |  4 +--
 src/Text/Pandoc/Class/IO.hs               |  6 ++---
 src/Text/Pandoc/Error.hs                  |  2 +-
 src/Text/Pandoc/PDF.hs                    | 25 +++++++++---------
 src/Text/Pandoc/Parsing.hs                |  2 +-
 src/Text/Pandoc/UTF8.hs                   | 42 +++++++++++++++----------------
 test/Tests/Command.hs                     |  2 +-
 test/Tests/Old.hs                         |  5 ++--
 test/Tests/Readers/LaTeX.hs               |  2 +-
 11 files changed, 70 insertions(+), 68 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 6a071ad5a..63996828e 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -73,8 +73,9 @@ convertWithOpts opts = do
   let verbosity = optVerbosity opts
 
   when (optDumpArgs opts) $
-    do UTF8.hPutStrLn stdout outputFile
-       mapM_ (UTF8.hPutStrLn stdout) (fromMaybe ["-"] $ optInputFiles opts)
+    do UTF8.hPutStrLn stdout (T.pack outputFile)
+       mapM_ (UTF8.hPutStrLn stdout . T.pack)
+             (fromMaybe ["-"] $ optInputFiles opts)
        exitSuccess
 
   let sources = case optInputFiles opts of
@@ -354,6 +355,5 @@ writeFnBinary "-" = liftIO . BL.putStr
 writeFnBinary f   = liftIO . BL.writeFile (UTF8.encodePath f)
 
 writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m ()
--- TODO this implementation isn't maximally efficient:
-writerFn eol "-" = liftIO . UTF8.putStrWith eol . T.unpack
-writerFn eol f   = liftIO . UTF8.writeFileWith eol f . T.unpack
+writerFn eol "-" = liftIO . UTF8.putStrWith eol
+writerFn eol f   = liftIO . UTF8.writeFileWith eol f
diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs
index a4c510d97..0a8193f6c 100644
--- a/src/Text/Pandoc/App/CommandLineOptions.hs
+++ b/src/Text/Pandoc/App/CommandLineOptions.hs
@@ -812,10 +812,10 @@ options =
                            map (\c -> ['-',c]) shorts ++
                            map ("--" ++) longs
                      let allopts = unwords (concatMap optnames options)
-                     UTF8.hPutStrLn stdout $ printf tpl allopts
-                         (unwords readersNames)
-                         (unwords writersNames)
-                         (unwords $ map (T.unpack . fst) highlightingStyles)
+                     UTF8.hPutStrLn stdout $ T.pack $ printf tpl allopts
+                         (T.unpack $ T.unwords readersNames)
+                         (T.unpack $ T.unwords writersNames)
+                         (T.unpack $ T.unwords $ map fst highlightingStyles)
                          (unwords datafiles)
                      exitSuccess ))
                  "" -- "Print bash completion script"
@@ -854,7 +854,7 @@ options =
                                else if extensionEnabled x allExts
                                        then '-'
                                        else ' ') : drop 4 (show x)
-                     mapM_ (UTF8.hPutStrLn stdout . showExt)
+                     mapM_ (UTF8.hPutStrLn stdout . T.pack . showExt)
                        [ex | ex <- extList, extensionEnabled ex allExts]
                      exitSuccess )
                   "FORMAT")
@@ -868,14 +868,14 @@ options =
                                  , sShortname s `notElem`
                                     [T.pack "Alert", T.pack "Alert_indent"]
                                  ]
-                     mapM_ (UTF8.hPutStrLn stdout) (sort langs)
+                     mapM_ (UTF8.hPutStrLn stdout . T.pack) (sort langs)
                      exitSuccess ))
                  ""
 
     , Option "" ["list-highlight-styles"]
                  (NoArg
                   (\_ -> do
-                     mapM_ (UTF8.hPutStrLn stdout . T.unpack . fst) highlightingStyles
+                     mapM_ (UTF8.hPutStrLn stdout . fst) highlightingStyles
                      exitSuccess ))
                  ""
 
@@ -893,7 +893,7 @@ options =
                             | T.null t -> -- e.g. for docx, odt, json:
                                 E.throwIO $ PandocCouldNotFindDataFileError $ T.pack
                                   ("templates/default." ++ arg)
-                            | otherwise -> write . T.unpack $ t
+                            | otherwise -> write t
                           Left e  -> E.throwIO e
                      exitSuccess)
                   "FORMAT")
@@ -940,11 +940,13 @@ options =
                   (\_ -> do
                      prg <- getProgName
                      defaultDatadirs <- defaultUserDataDirs
-                     UTF8.hPutStrLn stdout (prg ++ " " ++ T.unpack pandocVersion ++
-                       compileInfo ++
-                       "\nUser data directory: " ++
-                       intercalate " or " defaultDatadirs ++
-                       ('\n':copyrightMessage))
+                     UTF8.hPutStrLn stdout
+                      $ T.pack
+                      $ prg ++ " " ++ T.unpack pandocVersion ++
+                        compileInfo ++
+                        "\nUser data directory: " ++
+                        intercalate " or " defaultDatadirs ++
+                        ('\n':copyrightMessage)
                      exitSuccess ))
                  "" -- "Print version"
 
@@ -952,7 +954,7 @@ options =
                  (NoArg
                   (\_ -> do
                      prg <- getProgName
-                     UTF8.hPutStr stdout (usageMessage prg options)
+                     UTF8.hPutStr stdout (T.pack $ usageMessage prg options)
                      exitSuccess ))
                  "" -- "Show help"
     ]
@@ -1013,12 +1015,12 @@ handleUnrecognizedOption "-R" = handleUnrecognizedOption "--parse-raw"
 handleUnrecognizedOption x =
   (("Unknown option " ++ x ++ ".") :)
 
-readersNames :: [String]
-readersNames = sort (map (T.unpack . fst) (readers :: [(Text, Reader PandocIO)]))
+readersNames :: [Text]
+readersNames = sort (map fst (readers :: [(Text, Reader PandocIO)]))
 
-writersNames :: [String]
+writersNames :: [Text]
 writersNames = sort
-  ("pdf" : map (T.unpack . fst) (writers :: [(Text, Writer PandocIO)]))
+  ("pdf" : map fst (writers :: [(Text, Writer PandocIO)]))
 
 splitField :: String -> (String, String)
 splitField = second (tailDef "true") . break (`elemText` ":=")
diff --git a/src/Text/Pandoc/App/OutputSettings.hs b/src/Text/Pandoc/App/OutputSettings.hs
index 53c7d82ef..3864ab188 100644
--- a/src/Text/Pandoc/App/OutputSettings.hs
+++ b/src/Text/Pandoc/App/OutputSettings.hs
@@ -59,8 +59,8 @@ optToOutputSettings opts = do
   let outputFile = fromMaybe "-" (optOutputFile opts)
 
   when (optDumpArgs opts) . liftIO $ do
-    UTF8.hPutStrLn stdout outputFile
-    mapM_ (UTF8.hPutStrLn stdout) (fromMaybe [] $ optInputFiles opts)
+    UTF8.hPutStrLn stdout (T.pack outputFile)
+    mapM_ (UTF8.hPutStrLn stdout . T.pack) (fromMaybe [] $ optInputFiles opts)
     exitSuccess
 
   epubMetadata <- traverse readUtf8File $ optEpubMetadata opts
diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index eecda5711..bb4e2b732 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -183,7 +183,7 @@ getModificationTime = liftIOError System.Directory.getModificationTime
 logOutput :: (PandocMonad m, MonadIO m) => LogMessage -> m ()
 logOutput msg = liftIO $ do
   UTF8.hPutStr stderr $
-      "[" ++ show (messageVerbosity msg) ++ "] "
+      "[" <> T.pack (show (messageVerbosity msg)) <> "] "
   alertIndent $ T.lines $ showLogMessage msg
 
 -- | Prints the list of lines to @stderr@, indenting every but the first
@@ -191,10 +191,10 @@ logOutput msg = liftIO $ do
 alertIndent :: [Text] -> IO ()
 alertIndent [] = return ()
 alertIndent (l:ls) = do
-  UTF8.hPutStrLn stderr $ unpack l
+  UTF8.hPutStrLn stderr l
   mapM_ go ls
   where go l' = do UTF8.hPutStr stderr "  "
-                   UTF8.hPutStrLn stderr $ unpack l'
+                   UTF8.hPutStrLn stderr l'
 
 -- | Extract media from the mediabag into a directory.
 extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc
diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs
index 94c013cdb..0fdb7bfe5 100644
--- a/src/Text/Pandoc/Error.hs
+++ b/src/Text/Pandoc/Error.hs
@@ -191,6 +191,6 @@ handleError (Left e) =
 
 err :: Int -> Text -> IO a
 err exitCode msg = do
-  UTF8.hPutStrLn stderr (T.unpack msg)
+  UTF8.hPutStrLn stderr msg
   exitWith $ ExitFailure exitCode
   return undefined
diff --git a/src/Text/Pandoc/PDF.hs b/src/Text/Pandoc/PDF.hs
index 3f9dd8dad..6f462aad5 100644
--- a/src/Text/Pandoc/PDF.hs
+++ b/src/Text/Pandoc/PDF.hs
@@ -270,7 +270,7 @@ missingCharacterWarnings verbosity log' = do
         | isAscii c   = T.singleton c
         | otherwise   = T.pack $ c : " (U+" ++ printf "%04X" (ord c) ++ ")"
   let addCodePoint = T.concatMap toCodePoint
-  let warnings = [ addCodePoint (T.pack $ utf8ToString (BC.drop 19 l))
+  let warnings = [ addCodePoint (utf8ToText (BC.drop 19 l))
                  | l <- ls
                  , isMissingCharacterWarning l
                  ]
@@ -314,7 +314,7 @@ runTectonic verbosity program args' tmpDir' source = do
     env <- liftIO getEnvironment
     when (verbosity >= INFO) $ liftIO $
       showVerboseInfo (Just tmpDir) program programArgs env
-         (utf8ToString sourceBL)
+         (utf8ToText sourceBL)
     (exit, out) <- liftIO $ E.catch
       (pipeProcess (Just env) program programArgs sourceBL)
       (handlePDFProgramNotFound program)
@@ -385,7 +385,7 @@ runTeXProgram verbosity program args numRuns tmpDir' source = do
             (pipeProcess (Just env'') program programArgs BL.empty)
             (handlePDFProgramNotFound program)
           when (verbosity >= INFO) $ liftIO $ do
-            UTF8.hPutStrLn stderr $ "[makePDF] Run #" ++ show runNumber
+            UTF8.hPutStrLn stderr $ "[makePDF] Run #" <> tshow runNumber
             BL.hPutStr stderr out
             UTF8.hPutStr stderr "\n"
           if runNumber < numRuns
@@ -405,7 +405,7 @@ generic2pdf :: Verbosity
 generic2pdf verbosity program args source = do
   env' <- getEnvironment
   when (verbosity >= INFO) $
-    showVerboseInfo Nothing program args env' (T.unpack source)
+    showVerboseInfo Nothing program args env' source
   (exit, out) <- E.catch
     (pipeProcess (Just env') program args
                      (BL.fromStrict $ UTF8.fromText source))
@@ -494,19 +494,20 @@ showVerboseInfo :: Maybe FilePath
                 -> String
                 -> [String]
                 -> [(String, String)]
-                -> String
+                -> Text
                 -> IO ()
 showVerboseInfo mbTmpDir program programArgs env source = do
   case mbTmpDir of
     Just tmpDir -> do
       UTF8.hPutStrLn stderr "[makePDF] temp dir:"
-      UTF8.hPutStrLn stderr tmpDir
+      UTF8.hPutStrLn stderr (T.pack tmpDir)
     Nothing -> return ()
   UTF8.hPutStrLn stderr "[makePDF] Command line:"
-  UTF8.hPutStrLn stderr $ program ++ " " ++ unwords (map show programArgs)
+  UTF8.hPutStrLn stderr $
+       T.pack program <> " " <> T.pack (unwords (map show programArgs))
   UTF8.hPutStr stderr "\n"
   UTF8.hPutStrLn stderr "[makePDF] Environment:"
-  mapM_ (UTF8.hPutStrLn stderr . show) env
+  mapM_ (UTF8.hPutStrLn stderr . tshow) env
   UTF8.hPutStr stderr "\n"
   UTF8.hPutStrLn stderr "[makePDF] Source:"
   UTF8.hPutStrLn stderr source
@@ -517,8 +518,8 @@ handlePDFProgramNotFound program e
       E.throwIO $ PandocPDFProgramNotFoundError $ T.pack program
   | otherwise = E.throwIO e
 
-utf8ToString :: ByteString -> String
-utf8ToString lbs =
+utf8ToText :: ByteString -> Text
+utf8ToText lbs =
   case decodeUtf8' lbs of
-    Left _  -> BC.unpack lbs  -- if decoding fails, treat as latin1
-    Right t -> TL.unpack t
+    Left _  -> T.pack $ BC.unpack lbs  -- if decoding fails, treat as latin1
+    Right t -> TL.toStrict t
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 4c4dd531d..8d3799c3e 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -1148,7 +1148,7 @@ testStringWith :: Show a
                => ParserT Text ParserState Identity a
                -> Text
                -> IO ()
-testStringWith parser str = UTF8.putStrLn $ show $
+testStringWith parser str = UTF8.putStrLn $ tshow $
                             readWith parser defaultParserState str
 
 -- | Parsing options.
diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs
index 567f5abe5..4d5921faf 100644
--- a/src/Text/Pandoc/UTF8.hs
+++ b/src/Text/Pandoc/UTF8.hs
@@ -39,67 +39,65 @@ where
 
 import qualified Data.ByteString.Char8 as B
 import qualified Data.ByteString.Lazy.Char8 as BL
+import Data.Text (Text)
 import qualified Data.Text as T
+import qualified Data.Text.IO as TIO
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TL
 import Prelude hiding (getContents, putStr, putStrLn, readFile, writeFile)
 import System.IO hiding (getContents, hGetContents, hPutStr, hPutStrLn, putStr,
                   putStrLn, readFile, writeFile)
-import qualified System.IO as IO
 
-readFile :: FilePath -> IO String
+readFile :: FilePath -> IO Text
 readFile f = do
   h <- openFile (encodePath f) ReadMode
   hGetContents h
 
-getContents :: IO String
+getContents :: IO Text
 getContents = hGetContents stdin
 
-writeFileWith :: Newline -> FilePath -> String -> IO ()
+writeFileWith :: Newline -> FilePath -> Text -> IO ()
 writeFileWith eol f s =
   withFile (encodePath f) WriteMode $ \h -> hPutStrWith eol h s
 
-writeFile :: FilePath -> String -> IO ()
+writeFile :: FilePath -> Text -> IO ()
 writeFile = writeFileWith nativeNewline
 
-putStrWith :: Newline -> String -> IO ()
+putStrWith :: Newline -> Text -> IO ()
 putStrWith eol s = hPutStrWith eol stdout s
 
-putStr :: String -> IO ()
+putStr :: Text -> IO ()
 putStr = putStrWith nativeNewline
 
-putStrLnWith :: Newline -> String -> IO ()
+putStrLnWith :: Newline -> Text -> IO ()
 putStrLnWith eol s = hPutStrLnWith eol stdout s
 
-putStrLn :: String -> IO ()
+putStrLn :: Text -> IO ()
 putStrLn = putStrLnWith nativeNewline
 
-hPutStrWith :: Newline -> Handle -> String -> IO ()
+hPutStrWith :: Newline -> Handle -> Text -> IO ()
 hPutStrWith eol h s =
   hSetNewlineMode h (NewlineMode eol eol) >>
-  hSetEncoding h utf8 >> IO.hPutStr h s
+  hSetEncoding h utf8 >> TIO.hPutStr h s
 
-hPutStr :: Handle -> String -> IO ()
+hPutStr :: Handle -> Text -> IO ()
 hPutStr = hPutStrWith nativeNewline
 
-hPutStrLnWith :: Newline -> Handle -> String -> IO ()
+hPutStrLnWith :: Newline -> Handle -> Text -> IO ()
 hPutStrLnWith eol h s =
   hSetNewlineMode h (NewlineMode eol eol) >>
-  hSetEncoding h utf8 >> IO.hPutStrLn h s
+  hSetEncoding h utf8 >> TIO.hPutStrLn h s
 
-hPutStrLn :: Handle -> String -> IO ()
+hPutStrLn :: Handle -> Text -> IO ()
 hPutStrLn = hPutStrLnWith nativeNewline
 
-hGetContents :: Handle -> IO String
-hGetContents = fmap toString . B.hGetContents
--- hGetContents h = hSetEncoding h utf8_bom
---                   >> hSetNewlineMode h universalNewlineMode
---                   >> IO.hGetContents h
+hGetContents :: Handle -> IO Text
+hGetContents = fmap toText . B.hGetContents
 
 -- | Convert UTF8-encoded ByteString to Text, also
 -- removing '\r' characters.
-toText :: B.ByteString -> T.Text
+toText :: B.ByteString -> Text
 toText = T.decodeUtf8 . filterCRs . dropBOM
   where dropBOM bs =
          if "\xEF\xBB\xBF" `B.isPrefixOf` bs
@@ -127,7 +125,7 @@ toTextLazy = TL.decodeUtf8 . filterCRs . dropBOM
 toStringLazy :: BL.ByteString -> String
 toStringLazy = TL.unpack . toTextLazy
 
-fromText :: T.Text -> B.ByteString
+fromText :: Text -> B.ByteString
 fromText = T.encodeUtf8
 
 fromTextLazy :: TL.Text -> BL.ByteString
diff --git a/test/Tests/Command.hs b/test/Tests/Command.hs
index 07d825f73..59b04eac1 100644
--- a/test/Tests/Command.hs
+++ b/test/Tests/Command.hs
@@ -130,7 +130,7 @@ runCommandTest testExePath fp num code =
     let cmdline = "% " <> cmd
     let x = cmdline <> "\n" <> input <> "^D\n" <> norm
     let y = cmdline <> "\n" <> input <> "^D\n" <> newnorm
-    let updated = T.unpack $ T.replace (T.pack x) (T.pack y) (T.pack raw)
+    let updated = T.replace (T.pack x) (T.pack y) raw
     UTF8.writeFile fp' updated
 
 extractCommandTest :: FilePath -> FilePath -> TestTree
diff --git a/test/Tests/Old.hs b/test/Tests/Old.hs
index 17ece49fd..160086be2 100644
--- a/test/Tests/Old.hs
+++ b/test/Tests/Old.hs
@@ -22,6 +22,7 @@ import Test.Tasty (TestTree, testGroup)
 import Test.Tasty.Golden.Advanced (goldenTest)
 import Tests.Helpers hiding (test)
 import qualified Text.Pandoc.UTF8 as UTF8
+import qualified Data.Text as T
 
 tests :: FilePath -> [TestTree]
 tests pandocPath =
@@ -231,7 +232,7 @@ tests pandocPath =
 -- makes sure file is fully closed after reading
 readFile' :: FilePath -> IO String
 readFile' f = do s <- UTF8.readFile f
-                 return $! (length s `seq` s)
+                 return $! (T.length s `seq` T.unpack s)
 
 lhsWriterTests :: FilePath -> String -> [TestTree]
 lhsWriterTests pandocPath format
@@ -333,7 +334,7 @@ testWithNormalize normalizer pandocPath testname opts inp norm =
                              $ UTF8.toStringLazy out
                    -- filter \r so the tests will work on Windows machines
                  else fail $ "Pandoc failed with error code " ++ show ec
-        updateGolden = UTF8.writeFile norm
+        updateGolden = UTF8.writeFile norm . T.pack
         options = ["--data-dir=../data","--quiet"] ++ [inp] ++ opts
 
 compareValues :: FilePath -> [String] -> String -> String -> IO (Maybe String)
diff --git a/test/Tests/Readers/LaTeX.hs b/test/Tests/Readers/LaTeX.hs
index 77104c853..8385b751e 100644
--- a/test/Tests/Readers/LaTeX.hs
+++ b/test/Tests/Readers/LaTeX.hs
@@ -54,7 +54,7 @@ tokUntokRt s = untokenize (tokenize "random" t) == t
 tests :: [TestTree]
 tests = [ testGroup "tokenization"
           [ testCase "tokenizer round trip on test case" $ do
-                 orig <- T.pack <$> UTF8.readFile "../test/latex-reader.latex"
+                 orig <- UTF8.readFile "../test/latex-reader.latex"
                  let new = untokenize $ tokenize "../test/latex-reader.latex"
                              orig
                  assertEqual "untokenize . tokenize is identity" orig new
-- 
cgit v1.2.3


From 24191a2a278c0dec30bacd66b78cbb8cc8d91324 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 15 Mar 2021 10:37:35 -0700
Subject: Use foldl' instead of foldl everywhere.

---
 src/Text/Pandoc/App/CommandLineOptions.hs           | 4 ++--
 src/Text/Pandoc/Citeproc/Locator.hs                 | 3 ++-
 src/Text/Pandoc/Class/PandocMonad.hs                | 3 ++-
 src/Text/Pandoc/Extensions.hs                       | 3 ++-
 src/Text/Pandoc/Lua/Filter.hs                       | 3 ++-
 src/Text/Pandoc/Readers/Docx/Combine.hs             | 4 ++--
 src/Text/Pandoc/Readers/HTML/TagCategories.hs       | 1 +
 src/Text/Pandoc/Readers/Markdown.hs                 | 6 +++---
 src/Text/Pandoc/Readers/Odt/Arrows/State.hs         | 4 ++--
 src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs | 4 ++--
 src/Text/Pandoc/Readers/Odt/StyleReader.hs          | 4 ++--
 src/Text/Pandoc/Readers/Textile.hs                  | 4 ++--
 src/Text/Pandoc/Shared.hs                           | 4 ++--
 src/Text/Pandoc/Writers/HTML.hs                     | 8 ++++----
 src/Text/Pandoc/Writers/RST.hs                      | 4 ++--
 src/Text/Pandoc/Writers/Texinfo.hs                  | 4 ++--
 16 files changed, 34 insertions(+), 29 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs
index b4483f756..a6df12715 100644
--- a/src/Text/Pandoc/App/CommandLineOptions.hs
+++ b/src/Text/Pandoc/App/CommandLineOptions.hs
@@ -31,7 +31,7 @@ import Data.Aeson.Encode.Pretty (encodePretty', Config(..), keyOrder,
          defConfig, Indent(..), NumberFormat(..))
 import Data.Bifunctor (second)
 import Data.Char (toLower)
-import Data.List (intercalate, sort)
+import Data.List (intercalate, sort, foldl')
 #ifdef _WINDOWS
 #if MIN_VERSION_base(4,12,0)
 import Data.List (isPrefixOf)
@@ -93,7 +93,7 @@ parseOptionsFromArgs options' defaults prg rawArgs = do
         ("Try " ++ prg ++ " --help for more information.")
 
   -- thread option data structure through all supplied option actions
-  opts <- foldl (>>=) (return defaults) actions
+  opts <- foldl' (>>=) (return defaults) actions
   let mbArgs = case args of
                  [] -> Nothing
                  xs -> Just xs
diff --git a/src/Text/Pandoc/Citeproc/Locator.hs b/src/Text/Pandoc/Citeproc/Locator.hs
index dba762c02..44416ca12 100644
--- a/src/Text/Pandoc/Citeproc/Locator.hs
+++ b/src/Text/Pandoc/Citeproc/Locator.hs
@@ -7,6 +7,7 @@ where
 import Citeproc.Types
 import Data.Text (Text)
 import qualified Data.Text as T
+import Data.List (foldl')
 import Text.Parsec
 import Text.Pandoc.Definition
 import Text.Pandoc.Parsing (romanNumeral)
@@ -139,7 +140,7 @@ pBalancedBraces braces p = try $ do
   where
       except = notFollowedBy pBraces >> p
       -- outer and inner
-      surround = foldl (\a (open, close) -> sur open close except <|> a)
+      surround = foldl' (\a (open, close) -> sur open close except <|> a)
                        except
                        braces
 
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 86c8de79e..293a822a0 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -59,6 +59,7 @@ import Control.Monad.Except (MonadError (catchError, throwError),
                              MonadTrans, lift, when)
 import Data.Digest.Pure.SHA (sha1, showDigest)
 import Data.Maybe (fromMaybe)
+import Data.List (foldl')
 import Data.Time (UTCTime)
 import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds,
                              posixSecondsToUTCTime)
@@ -612,7 +613,7 @@ checkExistence fn = do
 -- | Canonicalizes a file path by removing redundant @.@ and @..@.
 makeCanonical :: FilePath -> FilePath
 makeCanonical = Posix.joinPath . transformPathParts . splitDirectories
- where  transformPathParts = reverse . foldl go []
+ where  transformPathParts = reverse . foldl' go []
         go as     "."  = as
         go (_:as) ".." = as
         go as     x    = x : as
diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs
index 3b96f9e04..266a09e3c 100644
--- a/src/Text/Pandoc/Extensions.hs
+++ b/src/Text/Pandoc/Extensions.hs
@@ -34,6 +34,7 @@ module Text.Pandoc.Extensions ( Extension(..)
 where
 import Data.Bits (clearBit, setBit, testBit, (.|.))
 import Data.Data (Data)
+import Data.List (foldl')
 import qualified Data.Text as T
 import Data.Typeable (Typeable)
 import GHC.Generics (Generic)
@@ -593,7 +594,7 @@ parseFormatSpec :: T.Text
 parseFormatSpec = parse formatSpec ""
   where formatSpec = do
           name <- formatName
-          (extsToEnable, extsToDisable) <- foldl (flip ($)) ([],[]) <$>
+          (extsToEnable, extsToDisable) <- foldl' (flip ($)) ([],[]) <$>
                                              many extMod
           return (T.pack name, reverse extsToEnable, reverse extsToDisable)
         formatName = many1 $ noneOf "-+"
diff --git a/src/Text/Pandoc/Lua/Filter.hs b/src/Text/Pandoc/Lua/Filter.hs
index bffe01a34..90967f295 100644
--- a/src/Text/Pandoc/Lua/Filter.hs
+++ b/src/Text/Pandoc/Lua/Filter.hs
@@ -22,6 +22,7 @@ import Control.Monad.Catch (finally, try)
 import Data.Data (Data, DataType, dataTypeConstrs, dataTypeName, dataTypeOf,
                   showConstr, toConstr, tyconUQname)
 import Data.Foldable (foldrM)
+import Data.List (foldl')
 import Data.Map (Map)
 import Data.Maybe (fromMaybe)
 import Foreign.Lua (Lua, Peekable, Pushable, StackIndex)
@@ -204,7 +205,7 @@ walkMeta lf (Pandoc m bs) = do
 
 walkPandoc :: LuaFilter -> Pandoc -> Lua Pandoc
 walkPandoc (LuaFilter fnMap) =
-  case foldl mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of
+  case foldl' mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of
     Just fn -> \x -> runFilterFunction fn x *> singleElement x
     Nothing -> return
 
diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs
index bcf26c4a3..7c6d01769 100644
--- a/src/Text/Pandoc/Readers/Docx/Combine.hs
+++ b/src/Text/Pandoc/Readers/Docx/Combine.hs
@@ -182,7 +182,7 @@ isAttrModifier _                  = False
 
 smushInlines :: [Inlines] -> Inlines
 smushInlines xs = combineInlines xs' mempty
-  where xs' = foldl combineInlines mempty xs
+  where xs' = foldl' combineInlines mempty xs
 
 smushBlocks :: [Blocks] -> Blocks
-smushBlocks xs = foldl combineBlocks mempty xs
+smushBlocks xs = foldl' combineBlocks mempty xs
diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
index b7bd40fee..67aba1cb1 100644
--- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs
+++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
@@ -23,6 +23,7 @@ where
 
 import Data.Set (Set, fromList, unions)
 import Data.Text (Text)
+import Data.List (foldl')
 
 eitherBlockOrInline :: Set Text
 eitherBlockOrInline = fromList
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 34edbcc17..dc94fc2d6 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -21,7 +21,7 @@ module Text.Pandoc.Readers.Markdown (
 import Control.Monad
 import Control.Monad.Except (throwError)
 import Data.Char (isAlphaNum, isPunctuation, isSpace)
-import Data.List (transpose, elemIndex, sortOn)
+import Data.List (transpose, elemIndex, sortOn, foldl')
 import qualified Data.Map as M
 import Data.Maybe
 import qualified Data.Set as Set
@@ -357,7 +357,7 @@ referenceKey = try $ do
   addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes
                           >> many (try $ spnl >> keyValAttr)
   blanklines
-  let attr'  = extractIdClass $ foldl (\x f -> f x) attr addKvs
+  let attr'  = extractIdClass $ foldl' (\x f -> f x) attr addKvs
       target = (escapeURI $ trimr src, tit)
   st <- getState
   let oldkeys = stateKeys st
@@ -613,7 +613,7 @@ attributes = try $ do
   spnl
   attrs <- many (attribute <* spnl)
   char '}'
-  return $ foldl (\x f -> f x) nullAttr attrs
+  return $ foldl' (\x f -> f x) nullAttr attrs
 
 attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr)
 attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr
diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs
index 93c6b5e79..96515bf56 100644
--- a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs
+++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs
@@ -22,7 +22,7 @@ module Text.Pandoc.Readers.Odt.Arrows.State where
 import Control.Arrow
 import qualified Control.Category as Cat
 import Control.Monad
-
+import Data.List (foldl')
 import Text.Pandoc.Readers.Odt.Arrows.Utils
 import Text.Pandoc.Readers.Odt.Generic.Fallible
 
@@ -122,7 +122,7 @@ iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f
 iterateSL :: (Foldable f, MonadPlus m)
           => ArrowState s    x     y
           -> ArrowState s (f x) (m y)
-iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f
+iterateSL a = ArrowState $ \(s,f) -> foldl' a' (s,mzero) f
   where a' (s',m) x = second (mplus m.return) $ runArrowState a (s',x)
 
 
diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
index 0d921e23b..341903046 100644
--- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
+++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
@@ -64,12 +64,12 @@ import qualified Data.Map             as M
 import           Data.Text (Text)
 import           Data.Default
 import           Data.Maybe
+import           Data.List (foldl')
 
 import qualified Text.Pandoc.XML.Light as XML
 
 import           Text.Pandoc.Readers.Odt.Arrows.State
 import           Text.Pandoc.Readers.Odt.Arrows.Utils
-
 import           Text.Pandoc.Readers.Odt.Generic.Namespaces
 import           Text.Pandoc.Readers.Odt.Generic.Utils
 import           Text.Pandoc.Readers.Odt.Generic.Fallible
@@ -293,7 +293,7 @@ readNSattributes         = fromState $ \state -> maybe (state, failEmpty     )
                          => XMLConverterState nsID extraState
                          -> Maybe (XMLConverterState nsID extraState)
     extractNSAttrs startState
-                         = foldl (\state d -> state >>= addNS d)
+                         = foldl' (\state d -> state >>= addNS d)
                                  (Just startState)
                                  nsAttribs
       where nsAttribs    = mapMaybe readNSattr (XML.elAttribs element)
diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
index 5e10f896c..b722aa07d 100644
--- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
@@ -44,7 +44,7 @@ import Control.Arrow
 
 import Data.Default
 import qualified Data.Foldable as F
-import Data.List (unfoldr)
+import Data.List (unfoldr, foldl')
 import qualified Data.Map as M
 import Data.Maybe
 import Data.Text (Text)
@@ -120,7 +120,7 @@ fontPitchReader = executeInSub NsOffice "font-face-decls" (
                               &&&
                               lookupDefaultingAttr NsStyle "font-pitch"
                             ))
-                    >>?^ ( M.fromList . foldl accumLegalPitches [] )
+                    >>?^ ( M.fromList . foldl' accumLegalPitches [] )
                   ) `ifFailedDo` returnV (Right M.empty)
   where accumLegalPitches ls (Nothing,_) = ls
         accumLegalPitches ls (Just n,p)  = (n,p):ls
diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs
index 860da2dc3..99238c7f0 100644
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@@ -38,7 +38,7 @@ module Text.Pandoc.Readers.Textile ( readTextile) where
 import Control.Monad (guard, liftM)
 import Control.Monad.Except (throwError)
 import Data.Char (digitToInt, isUpper)
-import Data.List (intersperse, transpose)
+import Data.List (intersperse, transpose, foldl')
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.HTML.TagSoup (Tag (..), fromAttrib)
@@ -627,7 +627,7 @@ code2 = do
 
 -- | Html / CSS attributes
 attributes :: PandocMonad m => ParserT Text ParserState m Attr
-attributes = foldl (flip ($)) ("",[],[]) <$>
+attributes = foldl' (flip ($)) ("",[],[]) <$>
   try (do special <- option id specialAttribute
           attrs <- many attribute
           return (special : attrs))
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index d11ad13f5..0ce9396b3 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -109,7 +109,7 @@ import qualified Data.Bifunctor as Bifunctor
 import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum,
                   generalCategory, GeneralCategory(NonSpacingMark,
                   SpacingCombiningMark, EnclosingMark, ConnectorPunctuation))
-import Data.List (find, intercalate, intersperse, sortOn)
+import Data.List (find, intercalate, intersperse, sortOn, foldl')
 import qualified Data.Map as M
 import Data.Maybe (mapMaybe, fromMaybe)
 import Data.Monoid (Any (..))
@@ -840,7 +840,7 @@ mapLeft = Bifunctor.first
 -- > collapseFilePath "parent/foo/.." ==  "parent"
 -- > collapseFilePath "/parent/foo/../../bar" ==  "/bar"
 collapseFilePath :: FilePath -> FilePath
-collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories
+collapseFilePath = Posix.joinPath . reverse . foldl' go [] . splitDirectories
   where
     go rs "." = rs
     go r@(p:rs) ".." = case p of
diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs
index 2f33cd467..332de1545 100644
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@@ -30,7 +30,7 @@ module Text.Pandoc.Writers.HTML (
   ) where
 import Control.Monad.State.Strict
 import Data.Char (ord)
-import Data.List (intercalate, intersperse, partition, delete, (\\))
+import Data.List (intercalate, intersperse, partition, delete, (\\), foldl')
 import Data.List.NonEmpty (NonEmpty((:|)))
 import Data.Maybe (fromMaybe, isJust, isNothing)
 import qualified Data.Set as Set
@@ -544,7 +544,7 @@ tagWithAttributes opts html5 selfClosing tagname attr =
 
 addAttrs :: PandocMonad m
          => WriterOptions -> Attr -> Html -> StateT WriterState m Html
-addAttrs opts attr h = foldl (!) h <$> attrsToHtml opts attr
+addAttrs opts attr h = foldl' (!) h <$> attrsToHtml opts attr
 
 toAttrs :: PandocMonad m
         => [(Text, Text)] -> StateT WriterState m [Attribute]
@@ -926,7 +926,7 @@ blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do
                                    numstyle']
                    else [])
   l <- ordList opts contents
-  return $ foldl (!) l attribs
+  return $ foldl' (!) l attribs
 blockToHtml opts (DefinitionList lst) = do
   contents <- mapM (\(term, defs) ->
                   do term' <- liftM H.dt $ inlineListToHtml opts term
@@ -1407,7 +1407,7 @@ inlineToHtml opts inline = do
                               Just "audio" -> mediaTag H5.audio "Audio"
                               Just _       -> (H5.embed, [])
                               _            -> imageTag
-                        return $ foldl (!) tag $ attributes ++ specAttrs
+                        return $ foldl' (!) tag $ attributes ++ specAttrs
                         -- note:  null title included, as in Markdown.pl
     (Note contents) -> do
                         notes <- gets stNotes
diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs
index d01e13db4..54d042332 100644
--- a/src/Text/Pandoc/Writers/RST.hs
+++ b/src/Text/Pandoc/Writers/RST.hs
@@ -16,7 +16,7 @@ reStructuredText:  <http://docutils.sourceforge.net/rst.html>
 module Text.Pandoc.Writers.RST ( writeRST, flatten ) where
 import Control.Monad.State.Strict
 import Data.Char (isSpace)
-import Data.List (transpose, intersperse)
+import Data.List (transpose, intersperse, foldl')
 import Data.Maybe (fromMaybe)
 import qualified Data.Text as T
 import Data.Text (Text)
@@ -509,7 +509,7 @@ flatten outer
   | null contents = [outer]
   | otherwise     = combineAll contents
   where contents = dropInlineParent outer
-        combineAll = foldl combine []
+        combineAll = foldl' combine []
 
         combine :: [Inline] -> Inline -> [Inline]
         combine f i =
diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs
index 53da70f84..9d695563f 100644
--- a/src/Text/Pandoc/Writers/Texinfo.hs
+++ b/src/Text/Pandoc/Writers/Texinfo.hs
@@ -15,7 +15,7 @@ module Text.Pandoc.Writers.Texinfo ( writeTexinfo ) where
 import Control.Monad.Except (throwError)
 import Control.Monad.State.Strict
 import Data.Char (chr, ord)
-import Data.List (maximumBy, transpose)
+import Data.List (maximumBy, transpose, foldl')
 import Data.Ord (comparing)
 import qualified Data.Set as Set
 import Data.Text (Text)
@@ -271,7 +271,7 @@ tableAnyRowToTexinfo :: PandocMonad m
                      -> [[Block]]
                      -> TI m (Doc Text)
 tableAnyRowToTexinfo itemtype aligns cols =
-  (literal itemtype $$) . foldl (\row item -> row $$
+  (literal itemtype $$) . foldl' (\row item -> row $$
   (if isEmpty row then empty else text " @tab ") <> item) empty <$> zipWithM alignedBlock aligns cols
 
 alignedBlock :: PandocMonad m
-- 
cgit v1.2.3


From aecbf8156eb7c36c4b41de27797e262c23728db5 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 11 Apr 2021 21:28:48 -0700
Subject: Remove Text.Pandoc.BCP47 module.

[API change]

Use Lang from UnicodeCollation.Lang instead.
This is a richer implementation of BCP 47.
---
 pandoc.cabal                            |   1 -
 src/Text/Pandoc/App.hs                  |   6 +-
 src/Text/Pandoc/BCP47.hs                |  99 -------------
 src/Text/Pandoc/Citeproc.hs             |  10 +-
 src/Text/Pandoc/Citeproc/BibTeX.hs      |  20 +--
 src/Text/Pandoc/Citeproc/Data.hs        |  12 +-
 src/Text/Pandoc/Class/CommonState.hs    |   2 +-
 src/Text/Pandoc/Class/PandocMonad.hs    |   4 +-
 src/Text/Pandoc/Readers/BibTeX.hs       |   9 +-
 src/Text/Pandoc/Readers/LaTeX.hs        |   2 +-
 src/Text/Pandoc/Readers/LaTeX/Lang.hs   | 241 ++++++++++++++++----------------
 src/Text/Pandoc/Writers/BibTeX.hs       |   2 +-
 src/Text/Pandoc/Writers/ConTeXt.hs      |  48 +++----
 src/Text/Pandoc/Writers/CslJson.hs      |   7 +-
 src/Text/Pandoc/Writers/Docx.hs         |   2 +-
 src/Text/Pandoc/Writers/LaTeX.hs        |   2 +-
 src/Text/Pandoc/Writers/LaTeX/Lang.hs   |   8 +-
 src/Text/Pandoc/Writers/ODT.hs          |   7 +-
 src/Text/Pandoc/Writers/OpenDocument.hs |   8 +-
 src/Text/Pandoc/Writers/Shared.hs       |   2 +-
 20 files changed, 198 insertions(+), 294 deletions(-)
 delete mode 100644 src/Text/Pandoc/BCP47.hs

(limited to 'src/Text/Pandoc/Class')

diff --git a/pandoc.cabal b/pandoc.cabal
index b6cbb0d7a..8816767e9 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -601,7 +601,6 @@ library
                    Text.Pandoc.Asciify,
                    Text.Pandoc.Emoji,
                    Text.Pandoc.ImageSize,
-                   Text.Pandoc.BCP47,
                    Text.Pandoc.Class,
                    Text.Pandoc.Citeproc
   other-modules:   Text.Pandoc.App.CommandLineOptions,
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 6b45e5418..67d3cce7d 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -55,7 +55,7 @@ import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
 import Text.Pandoc.App.CommandLineOptions (parseOptions, parseOptionsFromArgs,
                                            options)
 import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings)
-import Text.Pandoc.BCP47 (Lang (..), parseBCP47)
+import UnicodeCollation.Lang (Lang (..), parseLang)
 import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), applyFilters)
 import Text.Pandoc.PDF (makePDF)
 import Text.Pandoc.SelfContained (makeSelfContained)
@@ -200,8 +200,8 @@ convertWithOpts opts = do
                     Just f  -> readFileStrict f
 
     case lookupMetaString "lang" (optMetadata opts) of
-           ""      -> setTranslations $ Lang "en" "" "US" []
-           l       -> case parseBCP47 l of
+           ""      -> setTranslations $ Lang "en" Nothing (Just "US") [] [] []
+           l       -> case parseLang l of
                            Left _   -> report $ InvalidLang l
                            Right l' -> setTranslations l'
 
diff --git a/src/Text/Pandoc/BCP47.hs b/src/Text/Pandoc/BCP47.hs
deleted file mode 100644
index 1ecf0bf73..000000000
--- a/src/Text/Pandoc/BCP47.hs
+++ /dev/null
@@ -1,99 +0,0 @@
-{-# LANGUAGE OverloadedStrings #-}
-{- |
-   Module      : Text.Pandoc.BCP47
-   Copyright   : Copyright (C) 2017-2021 John MacFarlane
-   License     : GNU GPL, version 2 or above
-
-   Maintainer  : John MacFarlane <jgm@berkeley.edu>
-   Stability   : alpha
-   Portability : portable
-
-Functions for parsing and rendering BCP47 language identifiers.
--}
-module Text.Pandoc.BCP47 (
-                       getLang
-                     , parseBCP47
-                     , Lang(..)
-                     , renderLang
-                     )
-where
-import Control.Monad (guard)
-import Data.Char (isAlphaNum, isAscii, isLetter, isLower, isUpper)
-import Text.Pandoc.Definition
-import Text.Pandoc.Options
-import Text.DocTemplates (FromContext(..))
-import qualified Data.Text as T
-import qualified Text.Parsec as P
-
--- | Represents BCP 47 language/country code.
-data Lang = Lang{ langLanguage :: T.Text
-                , langScript   :: T.Text
-                , langRegion   :: T.Text
-                , langVariants :: [T.Text] }
-                deriving (Eq, Ord, Show)
-
--- | Render a Lang as BCP 47.
-renderLang :: Lang -> T.Text
-renderLang lang = T.intercalate "-" (langLanguage lang : filter (not . T.null)
-                    ([langScript lang, langRegion lang] ++ langVariants lang))
-
--- | Parse a BCP 47 string as a Lang.  Currently we parse
--- extensions and private-use fields as "variants," even
--- though officially they aren't.
-parseBCP47 :: T.Text -> Either T.Text Lang
-parseBCP47 lang =
-  case P.parse bcp47 "lang" lang of
-       Right r -> Right r
-       Left e  -> Left $ T.pack $ show e
-  where bcp47 = do
-          language <- pLanguage
-          script <- P.option "" pScript
-          region <- P.option "" pRegion
-          variants <- P.many (pVariant P.<|> pExtension P.<|> pPrivateUse)
-          P.eof
-          return Lang{   langLanguage = language
-                       , langScript = script
-                       , langRegion = region
-                       , langVariants = variants }
-        asciiLetter = P.satisfy (\c -> isAscii c && isLetter c)
-        pLanguage = do
-          cs <- P.many1 asciiLetter
-          let lcs = length cs
-          guard $ lcs == 2 || lcs == 3
-          return $ T.toLower $ T.pack cs
-        pScript = P.try $ do
-          P.char '-'
-          x <- P.satisfy (\c -> isAscii c && isLetter c && isUpper c)
-          xs <- P.count 3
-                 (P.satisfy (\c -> isAscii c && isLetter c && isLower c))
-          return $ T.toLower $ T.pack (x:xs)
-        pRegion = P.try $ do
-          P.char '-'
-          cs <- P.many1 asciiLetter
-          let lcs = length cs
-          guard $ lcs == 2 || lcs == 3
-          return $ T.toUpper $ T.pack cs
-        pVariant = P.try $ do
-          P.char '-'
-          ds <- P.option "" (P.count 1 P.digit)
-          cs <- P.many1 asciiLetter
-          let var = ds ++ cs
-              lv = length var
-          guard $ if null ds
-                     then lv >= 5 && lv <= 8
-                     else lv == 4
-          return $ T.toLower $ T.pack var
-        pExtension = P.try $ do
-          P.char '-'
-          cs <- P.many1 $ P.satisfy (\c -> isAscii c && isAlphaNum c)
-          let lcs = length cs
-          guard $ lcs >= 2 && lcs <= 8
-          return $ T.toLower $ T.pack cs
-        pPrivateUse = P.try $ do
-          P.char '-'
-          P.char 'x'
-          P.char '-'
-          cs <- P.many1 $ P.satisfy (\c -> isAscii c && isAlphaNum c)
-          guard $ not (null cs) && length cs <= 8
-          let var = "x-" ++ cs
-          return $ T.toLower $ T.pack var
diff --git a/src/Text/Pandoc/Citeproc.hs b/src/Text/Pandoc/Citeproc.hs
index af302f782..c9f1806e4 100644
--- a/src/Text/Pandoc/Citeproc.hs
+++ b/src/Text/Pandoc/Citeproc.hs
@@ -18,7 +18,6 @@ import Text.Pandoc.Citeproc.CslJson (cslJsonToReferences)
 import Text.Pandoc.Citeproc.BibTeX (readBibtexString, Variant(..))
 import Text.Pandoc.Citeproc.MetaValue (metaValueToReference, metaValueToText)
 import Text.Pandoc.Readers.Markdown (yamlToRefs)
-import qualified Text.Pandoc.BCP47 as BCP47
 import Text.Pandoc.Builder (Inlines, Many(..), deleteMeta, setMeta)
 import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Definition as Pandoc
@@ -630,13 +629,8 @@ removeFinalPeriod ils =
 
 bcp47LangToIETF :: PandocMonad m => Text -> m (Maybe Lang)
 bcp47LangToIETF bcplang =
-  case BCP47.parseBCP47 bcplang of
+  case parseLang bcplang of
     Left _ -> do
       report $ InvalidLang bcplang
       return Nothing
-    Right lang ->
-      return $ Just
-             $ Lang (BCP47.langLanguage lang)
-                    (if T.null (BCP47.langRegion lang)
-                        then Nothing
-                        else Just (BCP47.langRegion lang))
+    Right lang -> return $ Just lang
diff --git a/src/Text/Pandoc/Citeproc/BibTeX.hs b/src/Text/Pandoc/Citeproc/BibTeX.hs
index c0752dadc..510e56f9c 100644
--- a/src/Text/Pandoc/Citeproc/BibTeX.hs
+++ b/src/Text/Pandoc/Citeproc/BibTeX.hs
@@ -205,10 +205,13 @@ writeBibtexString opts variant mblang ref =
                    [ (", " <>) <$> nameGiven name,
                      nameDroppingParticle name ]
 
-  mblang' = (parseLang <$> getVariableAsText "language") <|> mblang
+  mblang' = case getVariableAsText "language" of
+              Just l  -> either (const Nothing) Just $ parseLang l
+              Nothing -> mblang
 
   titlecase = case mblang' of
-                Just (Lang "en" _) -> titlecase'
+                Just lang | langLanguage lang == "en"
+                                   -> titlecase'
                 Nothing            -> titlecase'
                 _                  ->
                   case variant of
@@ -331,7 +334,7 @@ writeBibtexString opts variant mblang ref =
   renderFields = mconcat . intersperse ("," <> cr) . mapMaybe renderField
 
 defaultLang :: Lang
-defaultLang = Lang "en" (Just "US")
+defaultLang = Lang "en" Nothing (Just "US") [] [] []
 
 -- a map of bibtex "string" macros
 type StringMap = Map.Map Text Text
@@ -351,9 +354,7 @@ itemToReference locale variant item = do
   bib item $ do
     let lang = fromMaybe defaultLang $ localeLanguage locale
     modify $ \st -> st{ localeLang = lang,
-                        untitlecase = case lang of
-                                           (Lang "en" _) -> True
-                                           _             -> False }
+                        untitlecase = langLanguage lang == "en" }
 
     id' <- asks identifier
     otherIds <- (Just <$> getRawField "ids")
@@ -711,7 +712,7 @@ itemToReference locale variant item = do
 
 
 bib :: Item -> Bib a -> BibParser a
-bib entry m = fst <$> evalRWST m entry (BibState True (Lang "en" (Just "US")))
+bib entry m = fst <$> evalRWST m entry (BibState True defaultLang)
 
 resolveCrossRefs :: Variant -> [Item] -> [Item]
 resolveCrossRefs variant entries =
@@ -1456,8 +1457,9 @@ resolveKey lang ils = Walk.walk go ils
         go x       = x
 
 resolveKey' :: Lang -> Text -> Text
-resolveKey' lang@(Lang l _) k =
-  case Map.lookup l biblatexStringMap >>= Map.lookup (T.toLower k) of
+resolveKey' lang k =
+  case Map.lookup (langLanguage lang) biblatexStringMap >>=
+        Map.lookup (T.toLower k) of
     Nothing     -> k
     Just (x, _) -> either (const k) stringify $ parseLaTeX lang x
 
diff --git a/src/Text/Pandoc/Citeproc/Data.hs b/src/Text/Pandoc/Citeproc/Data.hs
index 40430b0f5..388b9ba62 100644
--- a/src/Text/Pandoc/Citeproc/Data.hs
+++ b/src/Text/Pandoc/Citeproc/Data.hs
@@ -21,12 +21,12 @@ biblatexStringMap :: M.Map Text (M.Map Text (Text, Text))
 biblatexStringMap = foldr go mempty biblatexLocalizations
  where
   go (fp, bs) =
-    let Lang lang _ _ _ _ _ = parseLang
-                                (toIETF $ T.takeWhile (/= '.') $ T.pack fp)
-        ls = T.lines $ TE.decodeUtf8 bs
-     in if length ls > 4
-           then M.insert lang (toStringMap $ map (T.splitOn "|") ls)
-           else id
+    let ls = T.lines $ TE.decodeUtf8 bs
+     in case parseLang (toIETF $ T.takeWhile (/= '.') $ T.pack fp) of
+          Right lang | length ls > 4
+            -> M.insert (langLanguage lang)
+                        (toStringMap $ map (T.splitOn "|") ls)
+          _ -> id
   toStringMap = foldr go' mempty
   go' [term, x, y] = M.insert term (x, y)
   go' _ = id
diff --git a/src/Text/Pandoc/Class/CommonState.hs b/src/Text/Pandoc/Class/CommonState.hs
index 7e1735c2b..0fd094d99 100644
--- a/src/Text/Pandoc/Class/CommonState.hs
+++ b/src/Text/Pandoc/Class/CommonState.hs
@@ -19,7 +19,7 @@ where
 
 import Data.Default (Default (def))
 import Data.Text (Text)
-import Text.Pandoc.BCP47 (Lang)
+import UnicodeCollation.Lang (Lang)
 import Text.Pandoc.MediaBag (MediaBag)
 import Text.Pandoc.Logging (LogMessage, Verbosity (WARNING))
 import Text.Pandoc.Translations (Translations)
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 293a822a0..76f1fa32b 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -70,7 +70,7 @@ import Network.URI ( escapeURIString, nonStrictRelativeTo,
 import System.FilePath ((</>), (<.>), takeExtension, dropExtension,
                         isRelative, splitDirectories)
 import System.Random (StdGen)
-import Text.Pandoc.BCP47 (Lang(..), parseBCP47, renderLang)
+import UnicodeCollation.Lang (Lang(..), parseLang, renderLang)
 import Text.Pandoc.Class.CommonState (CommonState (..))
 import Text.Pandoc.Definition
 import Text.Pandoc.Error
@@ -285,7 +285,7 @@ readFileFromDirs (d:ds) f = catchError
 toLang :: PandocMonad m => Maybe T.Text -> m (Maybe Lang)
 toLang Nothing = return Nothing
 toLang (Just s) =
-  case parseBCP47 s of
+  case parseLang s of
        Left _ -> do
          report $ InvalidLang s
          return Nothing
diff --git a/src/Text/Pandoc/Readers/BibTeX.hs b/src/Text/Pandoc/Readers/BibTeX.hs
index 956b9f1f7..b82a81350 100644
--- a/src/Text/Pandoc/Readers/BibTeX.hs
+++ b/src/Text/Pandoc/Readers/BibTeX.hs
@@ -48,11 +48,14 @@ readBibLaTeX = readBibTeX' BibTeX.Biblatex
 
 readBibTeX' :: PandocMonad m => Variant -> ReaderOptions -> Text -> m Pandoc
 readBibTeX' variant _opts t = do
-  lang <- maybe (Lang "en" (Just "US")) parseLang
-             <$> lookupEnv "LANG"
+  mblangEnv <- lookupEnv "LANG"
+  let defaultLang = Lang "en" Nothing (Just "US") [] [] []
+  let lang = case mblangEnv of
+              Nothing  -> defaultLang
+              Just l   -> either (const defaultLang) id $ parseLang l
   locale <- case getLocale lang of
                Left e  ->
-                 case getLocale (Lang "en" (Just "US")) of
+                 case getLocale (Lang "en" Nothing (Just "US") [] [] []) of
                    Right l -> return l
                    Left _  -> throwError $ PandocCiteprocError e
                Right l -> return l
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 851756065..83caf742a 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -33,7 +33,7 @@ import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
 import System.FilePath (addExtension, replaceExtension, takeExtension)
-import Text.Pandoc.BCP47 (renderLang)
+import UnicodeCollation.Lang (renderLang)
 import Text.Pandoc.Builder as B
 import Text.Pandoc.Class.PandocPure (PandocPure)
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..), getResourcePath,
diff --git a/src/Text/Pandoc/Readers/LaTeX/Lang.hs b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
index 08e217bdb..b92e6ab57 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Lang.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
@@ -23,7 +23,7 @@ import qualified Data.Map as M
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.Pandoc.Shared (extractSpaces)
-import Text.Pandoc.BCP47 (Lang(..), renderLang)
+import UnicodeCollation.Lang (Lang(..), renderLang)
 import Text.Pandoc.Class (PandocMonad(..), setTranslations)
 import Text.Pandoc.Readers.LaTeX.Parsing
 import Text.Pandoc.Parsing (updateState, option, getState, QuoteContext(..),
@@ -99,133 +99,136 @@ setDefaultLanguage = do
 polyglossiaLangToBCP47 :: M.Map T.Text (T.Text -> Lang)
 polyglossiaLangToBCP47 = M.fromList
   [ ("arabic", \o -> case T.filter (/=' ') o of
-       "locale=algeria"    -> Lang "ar" "" "DZ" []
-       "locale=mashriq"    -> Lang "ar" "" "SY" []
-       "locale=libya"      -> Lang "ar" "" "LY" []
-       "locale=morocco"    -> Lang "ar" "" "MA" []
-       "locale=mauritania" -> Lang "ar" "" "MR" []
-       "locale=tunisia"    -> Lang "ar" "" "TN" []
-       _                   -> Lang "ar" "" "" [])
+       "locale=algeria"    -> Lang "ar" Nothing (Just "DZ") [] [] []
+       "locale=mashriq"    -> Lang "ar" Nothing (Just "SY") [] [] []
+       "locale=libya"      -> Lang "ar" Nothing (Just "LY") [] [] []
+       "locale=morocco"    -> Lang "ar" Nothing (Just "MA") [] [] []
+       "locale=mauritania" -> Lang "ar" Nothing (Just "MR") [] [] []
+       "locale=tunisia"    -> Lang "ar" Nothing (Just "TN") [] [] []
+       _                   -> Lang "ar" Nothing (Just "")   [] [] [])
   , ("german", \o -> case T.filter (/=' ') o of
-       "spelling=old" -> Lang "de" "" "DE" ["1901"]
+       "spelling=old" -> Lang "de" Nothing (Just "DE") ["1901"] [] []
        "variant=austrian,spelling=old"
-                       -> Lang "de" "" "AT" ["1901"]
-       "variant=austrian" -> Lang "de" "" "AT" []
+                       -> Lang "de" Nothing (Just "AT") ["1901"] [] []
+       "variant=austrian" -> Lang "de" Nothing (Just "AT") [] [] []
        "variant=swiss,spelling=old"
-                       -> Lang "de" "" "CH" ["1901"]
-       "variant=swiss" -> Lang "de" "" "CH" []
-       _ -> Lang "de" "" "" [])
-  , ("lsorbian", \_ -> Lang "dsb" "" "" [])
+                       -> Lang "de" Nothing (Just "CH") ["1901"] [] []
+       "variant=swiss" -> Lang "de" Nothing (Just "CH") [] [] []
+       _ -> Lang "de" Nothing Nothing [] [] [])
+  , ("lsorbian", \_ -> Lang "dsb" Nothing Nothing [] [] [])
   , ("greek", \o -> case T.filter (/=' ') o of
-       "variant=poly"    -> Lang "el" "" "polyton" []
-       "variant=ancient" -> Lang "grc" "" "" []
-       _                 -> Lang "el" "" "" [])
+       "variant=poly"    -> Lang "el" Nothing (Just "polyton") [] [] []
+       "variant=ancient" -> Lang "grc" Nothing Nothing [] [] []
+       _                 -> Lang "el" Nothing Nothing [] [] [])
   , ("english", \o -> case T.filter (/=' ') o of
-       "variant=australian" -> Lang "en" "" "AU" []
-       "variant=canadian"   -> Lang "en" "" "CA" []
-       "variant=british"    -> Lang "en" "" "GB" []
-       "variant=newzealand" -> Lang "en" "" "NZ" []
-       "variant=american"   -> Lang "en" "" "US" []
-       _                    -> Lang "en" "" "" [])
-  , ("usorbian", \_ -> Lang "hsb" "" "" [])
+       "variant=australian" -> Lang "en" Nothing (Just "AU") [] [] []
+       "variant=canadian"   -> Lang "en" Nothing (Just "CA") [] [] []
+       "variant=british"    -> Lang "en" Nothing (Just "GB") [] [] []
+       "variant=newzealand" -> Lang "en" Nothing (Just "NZ") [] [] []
+       "variant=american"   -> Lang "en" Nothing (Just "US") [] [] []
+       _                    -> Lang "en" Nothing (Just "")   [] [] [])
+  , ("usorbian", \_ -> Lang "hsb" Nothing Nothing [] [] [])
   , ("latin", \o -> case T.filter (/=' ') o of
-       "variant=classic" -> Lang "la" "" "" ["x-classic"]
-       _                 -> Lang "la" "" "" [])
-  , ("slovenian", \_ -> Lang "sl" "" "" [])
-  , ("serbianc", \_ -> Lang "sr" "cyrl" "" [])
-  , ("pinyin", \_ -> Lang "zh" "Latn" "" ["pinyin"])
-  , ("afrikaans", \_ -> Lang "af" "" "" [])
-  , ("amharic", \_ -> Lang "am" "" "" [])
-  , ("assamese", \_ -> Lang "as" "" "" [])
-  , ("asturian", \_ -> Lang "ast" "" "" [])
-  , ("bulgarian", \_ -> Lang "bg" "" "" [])
-  , ("bengali", \_ -> Lang "bn" "" "" [])
-  , ("tibetan", \_ -> Lang "bo" "" "" [])
-  , ("breton", \_ -> Lang "br" "" "" [])
-  , ("catalan", \_ -> Lang "ca" "" "" [])
-  , ("welsh", \_ -> Lang "cy" "" "" [])
-  , ("czech", \_ -> Lang "cs" "" "" [])
-  , ("coptic", \_ -> Lang "cop" "" "" [])
-  , ("danish", \_ -> Lang "da" "" "" [])
-  , ("divehi", \_ -> Lang "dv" "" "" [])
-  , ("esperanto", \_ -> Lang "eo" "" "" [])
-  , ("spanish", \_ -> Lang "es" "" "" [])
-  , ("estonian", \_ -> Lang "et" "" "" [])
-  , ("basque", \_ -> Lang "eu" "" "" [])
-  , ("farsi", \_ -> Lang "fa" "" "" [])
-  , ("finnish", \_ -> Lang "fi" "" "" [])
-  , ("french", \_ -> Lang "fr" "" "" [])
-  , ("friulan", \_ -> Lang "fur" "" "" [])
-  , ("irish", \_ -> Lang "ga" "" "" [])
-  , ("scottish", \_ -> Lang "gd" "" "" [])
-  , ("ethiopic", \_ -> Lang "gez" "" "" [])
-  , ("galician", \_ -> Lang "gl" "" "" [])
-  , ("hebrew", \_ -> Lang "he" "" "" [])
-  , ("hindi", \_ -> Lang "hi" "" "" [])
-  , ("croatian", \_ -> Lang "hr" "" "" [])
-  , ("magyar", \_ -> Lang "hu" "" "" [])
-  , ("armenian", \_ -> Lang "hy" "" "" [])
-  , ("interlingua", \_ -> Lang "ia" "" "" [])
-  , ("indonesian", \_ -> Lang "id" "" "" [])
-  , ("icelandic", \_ -> Lang "is" "" "" [])
-  , ("italian", \_ -> Lang "it" "" "" [])
-  , ("japanese", \_ -> Lang "jp" "" "" [])
-  , ("khmer", \_ -> Lang "km" "" "" [])
-  , ("kurmanji", \_ -> Lang "kmr" "" "" [])
-  , ("kannada", \_ -> Lang "kn" "" "" [])
-  , ("korean", \_ -> Lang "ko" "" "" [])
-  , ("lao", \_ -> Lang "lo" "" "" [])
-  , ("lithuanian", \_ -> Lang "lt" "" "" [])
-  , ("latvian", \_ -> Lang "lv" "" "" [])
-  , ("malayalam", \_ -> Lang "ml" "" "" [])
-  , ("mongolian", \_ -> Lang "mn" "" "" [])
-  , ("marathi", \_ -> Lang "mr" "" "" [])
-  , ("dutch", \_ -> Lang "nl" "" "" [])
-  , ("nynorsk", \_ -> Lang "nn" "" "" [])
-  , ("norsk", \_ -> Lang "no" "" "" [])
-  , ("nko", \_ -> Lang "nqo" "" "" [])
-  , ("occitan", \_ -> Lang "oc" "" "" [])
-  , ("panjabi", \_ -> Lang "pa" "" "" [])
-  , ("polish", \_ -> Lang "pl" "" "" [])
-  , ("piedmontese", \_ -> Lang "pms" "" "" [])
-  , ("portuguese", \_ -> Lang "pt" "" "" [])
-  , ("romansh", \_ -> Lang "rm" "" "" [])
-  , ("romanian", \_ -> Lang "ro" "" "" [])
-  , ("russian", \_ -> Lang "ru" "" "" [])
-  , ("sanskrit", \_ -> Lang "sa" "" "" [])
-  , ("samin", \_ -> Lang "se" "" "" [])
-  , ("slovak", \_ -> Lang "sk" "" "" [])
-  , ("albanian", \_ -> Lang "sq" "" "" [])
-  , ("serbian", \_ -> Lang "sr" "" "" [])
-  , ("swedish", \_ -> Lang "sv" "" "" [])
-  , ("syriac", \_ -> Lang "syr" "" "" [])
-  , ("tamil", \_ -> Lang "ta" "" "" [])
-  , ("telugu", \_ -> Lang "te" "" "" [])
-  , ("thai", \_ -> Lang "th" "" "" [])
-  , ("turkmen", \_ -> Lang "tk" "" "" [])
-  , ("turkish", \_ -> Lang "tr" "" "" [])
-  , ("ukrainian", \_ -> Lang "uk" "" "" [])
-  , ("urdu", \_ -> Lang "ur" "" "" [])
-  , ("vietnamese", \_ -> Lang "vi" "" "" [])
+       "variant=classic" -> Lang "la" Nothing Nothing ["x-classic"] [] []
+       _                 -> Lang "la" Nothing Nothing [] [] [])
+  , ("slovenian", \_ -> Lang "sl" Nothing Nothing [] [] [])
+  , ("serbianc", \_ -> Lang "sr" (Just "Cyrl") Nothing [] [] [])
+  , ("pinyin", \_ -> Lang "zh" (Just "Latn") Nothing ["pinyin"] [] [])
+  , ("afrikaans", \_ -> simpleLang "af")
+  , ("amharic", \_ -> simpleLang "am")
+  , ("assamese", \_ -> simpleLang "as")
+  , ("asturian", \_ -> simpleLang "ast")
+  , ("bulgarian", \_ -> simpleLang "bg")
+  , ("bengali", \_ -> simpleLang "bn")
+  , ("tibetan", \_ -> simpleLang "bo")
+  , ("breton", \_ -> simpleLang "br")
+  , ("catalan", \_ -> simpleLang "ca")
+  , ("welsh", \_ -> simpleLang "cy")
+  , ("czech", \_ -> simpleLang "cs")
+  , ("coptic", \_ -> simpleLang "cop")
+  , ("danish", \_ -> simpleLang "da")
+  , ("divehi", \_ -> simpleLang "dv")
+  , ("esperanto", \_ -> simpleLang "eo")
+  , ("spanish", \_ -> simpleLang "es")
+  , ("estonian", \_ -> simpleLang "et")
+  , ("basque", \_ -> simpleLang "eu")
+  , ("farsi", \_ -> simpleLang "fa")
+  , ("finnish", \_ -> simpleLang "fi")
+  , ("french", \_ -> simpleLang "fr")
+  , ("friulan", \_ -> simpleLang "fur")
+  , ("irish", \_ -> simpleLang "ga")
+  , ("scottish", \_ -> simpleLang "gd")
+  , ("ethiopic", \_ -> simpleLang "gez")
+  , ("galician", \_ -> simpleLang "gl")
+  , ("hebrew", \_ -> simpleLang "he")
+  , ("hindi", \_ -> simpleLang "hi")
+  , ("croatian", \_ -> simpleLang "hr")
+  , ("magyar", \_ -> simpleLang "hu")
+  , ("armenian", \_ -> simpleLang "hy")
+  , ("interlingua", \_ -> simpleLang "ia")
+  , ("indonesian", \_ -> simpleLang "id")
+  , ("icelandic", \_ -> simpleLang "is")
+  , ("italian", \_ -> simpleLang "it")
+  , ("japanese", \_ -> simpleLang "jp")
+  , ("khmer", \_ -> simpleLang "km")
+  , ("kurmanji", \_ -> simpleLang "kmr")
+  , ("kannada", \_ -> simpleLang "kn")
+  , ("korean", \_ -> simpleLang "ko")
+  , ("lao", \_ -> simpleLang "lo")
+  , ("lithuanian", \_ -> simpleLang "lt")
+  , ("latvian", \_ -> simpleLang "lv")
+  , ("malayalam", \_ -> simpleLang "ml")
+  , ("mongolian", \_ -> simpleLang "mn")
+  , ("marathi", \_ -> simpleLang "mr")
+  , ("dutch", \_ -> simpleLang "nl")
+  , ("nynorsk", \_ -> simpleLang "nn")
+  , ("norsk", \_ -> simpleLang "no")
+  , ("nko", \_ -> simpleLang "nqo")
+  , ("occitan", \_ -> simpleLang "oc")
+  , ("panjabi", \_ -> simpleLang "pa")
+  , ("polish", \_ -> simpleLang "pl")
+  , ("piedmontese", \_ -> simpleLang "pms")
+  , ("portuguese", \_ -> simpleLang "pt")
+  , ("romansh", \_ -> simpleLang "rm")
+  , ("romanian", \_ -> simpleLang "ro")
+  , ("russian", \_ -> simpleLang "ru")
+  , ("sanskrit", \_ -> simpleLang "sa")
+  , ("samin", \_ -> simpleLang "se")
+  , ("slovak", \_ -> simpleLang "sk")
+  , ("albanian", \_ -> simpleLang "sq")
+  , ("serbian", \_ -> simpleLang "sr")
+  , ("swedish", \_ -> simpleLang "sv")
+  , ("syriac", \_ -> simpleLang "syr")
+  , ("tamil", \_ -> simpleLang "ta")
+  , ("telugu", \_ -> simpleLang "te")
+  , ("thai", \_ -> simpleLang "th")
+  , ("turkmen", \_ -> simpleLang "tk")
+  , ("turkish", \_ -> simpleLang "tr")
+  , ("ukrainian", \_ -> simpleLang "uk")
+  , ("urdu", \_ -> simpleLang "ur")
+  , ("vietnamese", \_ -> simpleLang "vi")
   ]
 
+simpleLang :: Text -> Lang
+simpleLang l = Lang l Nothing Nothing [] [] []
+
 babelLangToBCP47 :: T.Text -> Maybe Lang
 babelLangToBCP47 s =
   case s of
-       "austrian" -> Just $ Lang "de" "" "AT" ["1901"]
-       "naustrian" -> Just $ Lang "de" "" "AT" []
-       "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"]
-       "nswissgerman" -> Just $ Lang "de" "" "CH" []
-       "german" -> Just $ Lang "de" "" "DE" ["1901"]
-       "ngerman" -> Just $ Lang "de" "" "DE" []
-       "lowersorbian" -> Just $ Lang "dsb" "" "" []
-       "uppersorbian" -> Just $ Lang "hsb" "" "" []
-       "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"]
-       "slovene" -> Just $ Lang "sl" "" "" []
-       "australian" -> Just $ Lang "en" "" "AU" []
-       "canadian" -> Just $ Lang "en" "" "CA" []
-       "british" -> Just $ Lang "en" "" "GB" []
-       "newzealand" -> Just $ Lang "en" "" "NZ" []
-       "american" -> Just $ Lang "en" "" "US" []
-       "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"]
+       "austrian" -> Just $ Lang "de" Nothing (Just "AT") ["1901"] [] []
+       "naustrian" -> Just $ Lang "de" Nothing (Just "AT") [] [] []
+       "swissgerman" -> Just $ Lang "de" Nothing (Just "CH") ["1901"] [] []
+       "nswissgerman" -> Just $ Lang "de" Nothing (Just "CH") [] [] []
+       "german" -> Just $ Lang "de" Nothing (Just "DE") ["1901"] [] []
+       "ngerman" -> Just $ Lang "de" Nothing (Just "DE") [] [] []
+       "lowersorbian" -> Just $ Lang "dsb" Nothing Nothing [] [] []
+       "uppersorbian" -> Just $ Lang "hsb" Nothing Nothing [] [] []
+       "polutonikogreek" -> Just $ Lang "el" Nothing Nothing ["polyton"] [] []
+       "slovene" -> Just $ simpleLang "sl"
+       "australian" -> Just $ Lang "en" Nothing (Just "AU") [] [] []
+       "canadian" -> Just $ Lang "en" Nothing (Just "CA") [] [] []
+       "british" -> Just $ Lang "en" Nothing (Just "GB") [] [] []
+       "newzealand" -> Just $ Lang "en" Nothing (Just "NZ") [] [] []
+       "american" -> Just $ Lang "en" Nothing (Just "US") [] [] []
+       "classiclatin" -> Just $ Lang "la" Nothing Nothing ["x-classic"] [] []
        _ -> ($ "") <$> M.lookup s polyglossiaLangToBCP47
diff --git a/src/Text/Pandoc/Writers/BibTeX.hs b/src/Text/Pandoc/Writers/BibTeX.hs
index b9ae0c13a..95de6b71f 100644
--- a/src/Text/Pandoc/Writers/BibTeX.hs
+++ b/src/Text/Pandoc/Writers/BibTeX.hs
@@ -43,7 +43,7 @@ writeBibTeX' :: PandocMonad m => Variant -> WriterOptions -> Pandoc -> m Text
 writeBibTeX' variant opts (Pandoc meta _) = do
   let mblang = case lookupMetaString "lang" meta of
                  "" -> Nothing
-                 t  -> Just $ parseLang t
+                 t  -> either (const Nothing) Just $ parseLang t
   let refs = case lookupMeta "references" meta of
                Just (MetaList xs) -> mapMaybe metaValueToReference xs
                _ -> []
diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs
index 3c9975be8..f352c84bc 100644
--- a/src/Text/Pandoc/Writers/ConTeXt.hs
+++ b/src/Text/Pandoc/Writers/ConTeXt.hs
@@ -21,7 +21,7 @@ import Data.Maybe (mapMaybe)
 import Data.Text (Text)
 import qualified Data.Text as T
 import Network.URI (unEscapeString)
-import Text.Pandoc.BCP47
+import UnicodeCollation.Lang (Lang(..))
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import Text.Pandoc.Definition
 import Text.Pandoc.ImageSize
@@ -555,26 +555,26 @@ fromBCP47 mbs = fromBCP47' <$> toLang mbs
 -- https://tools.ietf.org/html/bcp47#section-2.1
 -- http://wiki.contextgarden.net/Language_Codes
 fromBCP47' :: Maybe Lang -> Maybe Text
-fromBCP47' (Just (Lang "ar" _ "SY" _)     ) = Just "ar-sy"
-fromBCP47' (Just (Lang "ar" _ "IQ" _)     ) = Just "ar-iq"
-fromBCP47' (Just (Lang "ar" _ "JO" _)     ) = Just "ar-jo"
-fromBCP47' (Just (Lang "ar" _ "LB" _)     ) = Just "ar-lb"
-fromBCP47' (Just (Lang "ar" _ "DZ" _)     ) = Just "ar-dz"
-fromBCP47' (Just (Lang "ar" _ "MA" _)     ) = Just "ar-ma"
-fromBCP47' (Just (Lang "de" _ _ ["1901"]) ) = Just "deo"
-fromBCP47' (Just (Lang "de" _ "DE" _)     ) = Just "de-de"
-fromBCP47' (Just (Lang "de" _ "AT" _)     ) = Just "de-at"
-fromBCP47' (Just (Lang "de" _ "CH" _)     ) = Just "de-ch"
-fromBCP47' (Just (Lang "el" _ _ ["poly"]) ) = Just "agr"
-fromBCP47' (Just (Lang "en" _ "US" _)     ) = Just "en-us"
-fromBCP47' (Just (Lang "en" _ "GB" _)     ) = Just "en-gb"
-fromBCP47' (Just (Lang "grc"_  _ _)       ) = Just "agr"
-fromBCP47' (Just (Lang "el" _ _ _)        ) = Just "gr"
-fromBCP47' (Just (Lang "eu" _ _ _)        ) = Just "ba"
-fromBCP47' (Just (Lang "he" _ _ _)        ) = Just "il"
-fromBCP47' (Just (Lang "jp" _ _ _)        ) = Just "ja"
-fromBCP47' (Just (Lang "uk" _ _ _)        ) = Just "ua"
-fromBCP47' (Just (Lang "vi" _ _ _)        ) = Just "vn"
-fromBCP47' (Just (Lang "zh" _ _ _)        ) = Just "cn"
-fromBCP47' (Just (Lang l _ _ _)           ) = Just l
-fromBCP47' Nothing                          = Nothing
+fromBCP47' (Just (Lang "ar" _ (Just "SY") _ _ _)) = Just "ar-sy"
+fromBCP47' (Just (Lang "ar" _ (Just "IQ") _ _ _)) = Just "ar-iq"
+fromBCP47' (Just (Lang "ar" _ (Just "JO") _ _ _)) = Just "ar-jo"
+fromBCP47' (Just (Lang "ar" _ (Just "LB") _ _ _)) = Just "ar-lb"
+fromBCP47' (Just (Lang "ar" _ (Just "DZ") _ _ _)) = Just "ar-dz"
+fromBCP47' (Just (Lang "ar" _ (Just "MA") _ _ _)) = Just "ar-ma"
+fromBCP47' (Just (Lang "de" _ _ ["1901"] _ _))    = Just "deo"
+fromBCP47' (Just (Lang "de" _ (Just "DE") _ _ _)) = Just "de-de"
+fromBCP47' (Just (Lang "de" _ (Just "AT") _ _ _)) = Just "de-at"
+fromBCP47' (Just (Lang "de" _ (Just "CH") _ _ _)) = Just "de-ch"
+fromBCP47' (Just (Lang "el" _ _ ["poly"] _ _))    = Just "agr"
+fromBCP47' (Just (Lang "en" _ (Just "US") _ _ _)) = Just "en-us"
+fromBCP47' (Just (Lang "en" _ (Just "GB") _ _ _)) = Just "en-gb"
+fromBCP47' (Just (Lang "grc"_ _ _ _ _))           = Just "agr"
+fromBCP47' (Just (Lang "el" _ _ _ _ _))           = Just "gr"
+fromBCP47' (Just (Lang "eu" _ _ _ _ _))           = Just "ba"
+fromBCP47' (Just (Lang "he" _ _ _ _ _))           = Just "il"
+fromBCP47' (Just (Lang "jp" _ _ _ _ _))           = Just "ja"
+fromBCP47' (Just (Lang "uk" _ _ _ _ _))           = Just "ua"
+fromBCP47' (Just (Lang "vi" _ _ _ _ _))           = Just "vn"
+fromBCP47' (Just (Lang "zh" _ _ _ _ _))           = Just "cn"
+fromBCP47' (Just (Lang l _ _ _ _ _))              = Just l
+fromBCP47' Nothing                                = Nothing
diff --git a/src/Text/Pandoc/Writers/CslJson.hs b/src/Text/Pandoc/Writers/CslJson.hs
index a10def95e..395335667 100644
--- a/src/Text/Pandoc/Writers/CslJson.hs
+++ b/src/Text/Pandoc/Writers/CslJson.hs
@@ -34,15 +34,16 @@ import Control.Monad.Identity
 import Citeproc.Locale (getLocale)
 import Citeproc.CslJson
 import Text.Pandoc.Options (WriterOptions)
-import Data.Maybe (mapMaybe)
+import Data.Maybe (mapMaybe, fromMaybe)
 import Data.Aeson.Encode.Pretty         (Config (..), Indent (Spaces),
                                          NumberFormat (Generic),
                                          defConfig, encodePretty')
 
 writeCslJson :: PandocMonad m => WriterOptions -> Pandoc -> m Text
 writeCslJson _opts (Pandoc meta _) = do
-  let lang = maybe (Lang "en" (Just "US")) parseLang
-               (lookupMeta "lang" meta >>= metaValueToText)
+  let lang = fromMaybe (Lang "en" Nothing (Just "US") [] [] [])
+               (lookupMeta "lang" meta >>= metaValueToText >>=
+                  either (const Nothing) Just . parseLang)
   locale <- case getLocale lang of
                Left e  -> throwError $ PandocCiteprocError e
                Right l -> return l
diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs
index 20bcd0324..7781df8e7 100644
--- a/src/Text/Pandoc/Writers/Docx.hs
+++ b/src/Text/Pandoc/Writers/Docx.hs
@@ -36,7 +36,7 @@ import qualified Data.Text.Lazy as TL
 import Data.Time.Clock.POSIX
 import Data.Digest.Pure.SHA (sha1, showDigest)
 import Skylighting
-import Text.Pandoc.BCP47 (getLang, renderLang)
+import UnicodeCollation.Lang (renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import qualified Text.Pandoc.Class.PandocMonad as P
 import Data.Time
diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs
index 1c970e6ad..e99bad738 100644
--- a/src/Text/Pandoc/Writers/LaTeX.hs
+++ b/src/Text/Pandoc/Writers/LaTeX.hs
@@ -29,7 +29,7 @@ import qualified Data.Text as T
 import Network.URI (unEscapeString)
 import Text.DocTemplates (FromContext(lookupContext), renderTemplate,
                           Val(..), Context(..))
-import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang)
+import UnicodeCollation.Lang (Lang (..), renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import Text.Pandoc.Definition
 import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight,
diff --git a/src/Text/Pandoc/Writers/LaTeX/Lang.hs b/src/Text/Pandoc/Writers/LaTeX/Lang.hs
index 871b2692a..437b84120 100644
--- a/src/Text/Pandoc/Writers/LaTeX/Lang.hs
+++ b/src/Text/Pandoc/Writers/LaTeX/Lang.hs
@@ -46,7 +46,7 @@ toPolyglossia (Lang "de" _ (Just "AT") vars _ _)
 toPolyglossia (Lang "de" _ (Just "AT") _ _ _)  = ("german", "variant=austrian")
 toPolyglossia (Lang "de" _ (Just "CH") vars _ _)
   | "1901" `elem` vars        = ("german", "variant=swiss, spelling=old")
-toPolyglossia (Lang "de" _ (Just "CH") _ _ _ _) = ("german", "variant=swiss")
+toPolyglossia (Lang "de" _ (Just "CH") _ _ _) = ("german", "variant=swiss")
 toPolyglossia (Lang "de" _ _ _ _ _)           = ("german", "")
 toPolyglossia (Lang "dsb" _ _ _ _ _)          = ("lsorbian", "")
 toPolyglossia (Lang "el" _ _ vars _ _)
@@ -61,9 +61,9 @@ toPolyglossia (Lang "grc" _ _ _ _ _)          = ("greek",   "variant=ancient")
 toPolyglossia (Lang "hsb" _ _ _ _ _)          = ("usorbian", "")
 toPolyglossia (Lang "la" _ _ vars _ _)
   | "x-classic" `elem` vars                   = ("latin", "variant=classic")
-toPolyglossia (Lang "pt" _ "BR" _ _ _)        = ("portuguese", "variant=brazilian")
+toPolyglossia (Lang "pt" _ (Just "BR") _ _ _) = ("portuguese", "variant=brazilian")
 toPolyglossia (Lang "sl" _ _ _ _ _)           = ("slovenian", "")
-toPolyglossia x                           = (commonFromBcp47 x, "")
+toPolyglossia x                               = (commonFromBcp47 x, "")
 
 -- Takes a list of the constituents of a BCP47 language code and
 -- converts it to a Babel language string.
@@ -81,7 +81,7 @@ toBabel (Lang "de" _ _ vars _ _)
   | "1901" `elem` vars                  = "german"
   | otherwise                           = "ngerman"
 toBabel (Lang "dsb" _ _ _ _ _)          = "lowersorbian"
-toBabel (Lang "el" _ _ vars)
+toBabel (Lang "el" _ _ vars _ _)
   | "polyton" `elem` vars               = "polutonikogreek"
 toBabel (Lang "en" _ (Just "AU") _ _ _) = "australian"
 toBabel (Lang "en" _ (Just "CA") _ _ _) = "canadian"
diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs
index 101b236aa..6fd4cdeb4 100644
--- a/src/Text/Pandoc/Writers/ODT.hs
+++ b/src/Text/Pandoc/Writers/ODT.hs
@@ -16,6 +16,7 @@ import Codec.Archive.Zip
 import Control.Monad.Except (catchError, throwError)
 import Control.Monad.State.Strict
 import qualified Data.ByteString.Lazy as B
+import Data.Maybe (fromMaybe)
 import Data.Generics (everywhere', mkT)
 import Data.List (isPrefixOf)
 import qualified Data.Map as Map
@@ -23,7 +24,7 @@ import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import Data.Time
 import System.FilePath (takeDirectory, takeExtension, (<.>))
-import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang)
+import UnicodeCollation.Lang (Lang (..), renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import qualified Text.Pandoc.Class.PandocMonad as P
 import Text.Pandoc.Definition
@@ -35,7 +36,7 @@ import Text.Pandoc.Options (WrapOption (..), WriterOptions (..))
 import Text.DocLayout
 import Text.Pandoc.Shared (stringify, pandocVersion, tshow)
 import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks,
-                                   fixDisplayMath)
+                                   fixDisplayMath, getLang)
 import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy)
 import Text.Pandoc.Walk
 import Text.Pandoc.Writers.OpenDocument (writeOpenDocument)
@@ -194,7 +195,7 @@ addLang lang = everywhere' (mkT updateLangAttr)
     where updateLangAttr (Attr n@(QName "language" _ (Just "fo")) _)
                            = Attr n (langLanguage lang)
           updateLangAttr (Attr n@(QName "country" _ (Just "fo")) _)
-                           = Attr n (langRegion lang)
+                           = Attr n (fromMaybe "" $ langRegion lang)
           updateLangAttr x = x
 
 -- | transform both Image and Math elements
diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs
index cf42f2228..6c265090c 100644
--- a/src/Text/Pandoc/Writers/OpenDocument.hs
+++ b/src/Text/Pandoc/Writers/OpenDocument.hs
@@ -25,7 +25,7 @@ import Data.Ord (comparing)
 import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
-import Text.Pandoc.BCP47 (Lang (..), parseBCP47)
+import UnicodeCollation.Lang (Lang (..), parseLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, translateTerm,
                                       setTranslations, toLang)
 import Text.Pandoc.Definition
@@ -236,7 +236,7 @@ handleSpaces s = case T.uncons s of
 -- | Convert Pandoc document to string in OpenDocument format.
 writeOpenDocument :: PandocMonad m => WriterOptions -> Pandoc -> m Text
 writeOpenDocument opts (Pandoc meta blocks) = do
-  let defLang = Lang "en" "US" "" []
+  let defLang = Lang "en" (Just "US") Nothing [] [] []
   lang <- case lookupMetaString "lang" meta of
             "" -> pure defLang
             s  -> fromMaybe defLang <$> toLang (Just s)
@@ -893,7 +893,7 @@ textStyleAttr m s
                     Map.insert "style:font-name-complex" "Courier New" $ m
     | Language lang <- s
                   = Map.insert "fo:language" (langLanguage lang) .
-                    Map.insert "fo:country" (langRegion lang) $ m
+                    maybe id (Map.insert "fo:country") (langRegion lang) $ m
     | otherwise   = m
 
 withLangFromAttr :: PandocMonad m => Attr -> OD m a -> OD m a
@@ -901,7 +901,7 @@ withLangFromAttr (_,_,kvs) action =
   case lookup "lang" kvs of
        Nothing -> action
        Just l  ->
-         case parseBCP47 l of
+         case parseLang l of
               Right lang -> withTextStyle (Language lang) action
               Left _ -> do
                 report $ InvalidLang l
diff --git a/src/Text/Pandoc/Writers/Shared.hs b/src/Text/Pandoc/Writers/Shared.hs
index fcb47bd5a..a09d18571 100644
--- a/src/Text/Pandoc/Writers/Shared.hs
+++ b/src/Text/Pandoc/Writers/Shared.hs
@@ -149,7 +149,7 @@ defField field val (Context m) =
     f _newval oldval = oldval
 
 -- | Get the contents of the `lang` metadata field or variable.
-getLang :: WriterOptions -> Meta -> Maybe Text
+getLang :: WriterOptions -> Meta -> Maybe T.Text
 getLang opts meta =
   case lookupContext "lang" (writerVariables opts) of
         Just s -> Just s
-- 
cgit v1.2.3


From a478a5c4c8753fd0bf272cd540ca197ae146a196 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sat, 17 Apr 2021 11:47:54 -0700
Subject: Update to released unicode-collation, latest citeproc dev version.

Update citeproc test.
---
 cabal.project                           | 11 +----------
 src/Text/Pandoc/App.hs                  |  2 +-
 src/Text/Pandoc/Citeproc/Data.hs        |  2 +-
 src/Text/Pandoc/Class/CommonState.hs    |  2 +-
 src/Text/Pandoc/Class/PandocMonad.hs    |  2 +-
 src/Text/Pandoc/Readers/LaTeX.hs        |  2 +-
 src/Text/Pandoc/Readers/LaTeX/Lang.hs   |  2 +-
 src/Text/Pandoc/Writers/ConTeXt.hs      |  2 +-
 src/Text/Pandoc/Writers/Docbook.hs      |  2 +-
 src/Text/Pandoc/Writers/Docx.hs         |  2 +-
 src/Text/Pandoc/Writers/LaTeX.hs        |  2 +-
 src/Text/Pandoc/Writers/LaTeX/Lang.hs   |  2 +-
 src/Text/Pandoc/Writers/ODT.hs          |  2 +-
 src/Text/Pandoc/Writers/OpenDocument.hs |  2 +-
 stack.yaml                              |  5 ++---
 test/command/pandoc-citeproc-320a.md    |  8 ++++----
 16 files changed, 20 insertions(+), 30 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/cabal.project b/cabal.project
index fa17a20a6..77bc8ef88 100644
--- a/cabal.project
+++ b/cabal.project
@@ -5,14 +5,5 @@ flags: +embed_data_files
 source-repository-package
   type: git
   location: https://github.com/jgm/citeproc
-  tag: b42857be658b8f2649e989e061978e304986f853
+  tag: f9439e07e9271c7c2674a51efcad2fb8c663b2c8
 
-source-repository-package
-  type: git
-  location: https://github.com/jgm/unicode-collation
-  tag: 9d229a5c6bcbaf53d7022575234eb223cfa90d55
-
--- source-repository-package
---  type: git
---  location: https://github.com/jgm/citeproc
---  tag: d44e24696ab444090d0e63e321c3a573f68b2e74
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 67d3cce7d..4e8c9f2ab 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -55,7 +55,7 @@ import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
 import Text.Pandoc.App.CommandLineOptions (parseOptions, parseOptionsFromArgs,
                                            options)
 import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings)
-import UnicodeCollation.Lang (Lang (..), parseLang)
+import Text.Collate.Lang (Lang (..), parseLang)
 import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), applyFilters)
 import Text.Pandoc.PDF (makePDF)
 import Text.Pandoc.SelfContained (makeSelfContained)
diff --git a/src/Text/Pandoc/Citeproc/Data.hs b/src/Text/Pandoc/Citeproc/Data.hs
index 388b9ba62..848a83a1e 100644
--- a/src/Text/Pandoc/Citeproc/Data.hs
+++ b/src/Text/Pandoc/Citeproc/Data.hs
@@ -10,7 +10,7 @@ import qualified Data.Text.Encoding as TE
 import qualified Data.Text as T
 import Data.Text (Text)
 import Text.Pandoc.Citeproc.Util (toIETF)
-import UnicodeCollation.Lang (Lang(..), parseLang)
+import Text.Collate.Lang (Lang(..), parseLang)
 
 biblatexLocalizations :: [(FilePath, ByteString)]
 biblatexLocalizations = $(embedDir "citeproc/biblatex-localization")
diff --git a/src/Text/Pandoc/Class/CommonState.hs b/src/Text/Pandoc/Class/CommonState.hs
index 0fd094d99..796a4afd5 100644
--- a/src/Text/Pandoc/Class/CommonState.hs
+++ b/src/Text/Pandoc/Class/CommonState.hs
@@ -19,7 +19,7 @@ where
 
 import Data.Default (Default (def))
 import Data.Text (Text)
-import UnicodeCollation.Lang (Lang)
+import Text.Collate.Lang (Lang)
 import Text.Pandoc.MediaBag (MediaBag)
 import Text.Pandoc.Logging (LogMessage, Verbosity (WARNING))
 import Text.Pandoc.Translations (Translations)
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 76f1fa32b..7559cd7cd 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -70,7 +70,7 @@ import Network.URI ( escapeURIString, nonStrictRelativeTo,
 import System.FilePath ((</>), (<.>), takeExtension, dropExtension,
                         isRelative, splitDirectories)
 import System.Random (StdGen)
-import UnicodeCollation.Lang (Lang(..), parseLang, renderLang)
+import Text.Collate.Lang (Lang(..), parseLang, renderLang)
 import Text.Pandoc.Class.CommonState (CommonState (..))
 import Text.Pandoc.Definition
 import Text.Pandoc.Error
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 83caf742a..203dab83c 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -33,7 +33,7 @@ import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
 import System.FilePath (addExtension, replaceExtension, takeExtension)
-import UnicodeCollation.Lang (renderLang)
+import Text.Collate.Lang (renderLang)
 import Text.Pandoc.Builder as B
 import Text.Pandoc.Class.PandocPure (PandocPure)
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..), getResourcePath,
diff --git a/src/Text/Pandoc/Readers/LaTeX/Lang.hs b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
index b92e6ab57..6a8327904 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Lang.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
@@ -23,7 +23,7 @@ import qualified Data.Map as M
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.Pandoc.Shared (extractSpaces)
-import UnicodeCollation.Lang (Lang(..), renderLang)
+import Text.Collate.Lang (Lang(..), renderLang)
 import Text.Pandoc.Class (PandocMonad(..), setTranslations)
 import Text.Pandoc.Readers.LaTeX.Parsing
 import Text.Pandoc.Parsing (updateState, option, getState, QuoteContext(..),
diff --git a/src/Text/Pandoc/Writers/ConTeXt.hs b/src/Text/Pandoc/Writers/ConTeXt.hs
index f352c84bc..f14b1d894 100644
--- a/src/Text/Pandoc/Writers/ConTeXt.hs
+++ b/src/Text/Pandoc/Writers/ConTeXt.hs
@@ -21,7 +21,7 @@ import Data.Maybe (mapMaybe)
 import Data.Text (Text)
 import qualified Data.Text as T
 import Network.URI (unEscapeString)
-import UnicodeCollation.Lang (Lang(..))
+import Text.Collate.Lang (Lang(..))
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import Text.Pandoc.Definition
 import Text.Pandoc.ImageSize
diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs
index 1f10c9d04..02b141250 100644
--- a/src/Text/Pandoc/Writers/Docbook.hs
+++ b/src/Text/Pandoc/Writers/Docbook.hs
@@ -493,4 +493,4 @@ isSectionAttr DocBook4 ("os",_) = True
 isSectionAttr DocBook4 ("revision",_) = True
 isSectionAttr DocBook4 ("security",_) = True
 isSectionAttr DocBook4 ("vendor",_) = True
-isSectionAttr _ (_,_) = False
\ No newline at end of file
+isSectionAttr _ (_,_) = False
diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs
index 7781df8e7..749ad9a21 100644
--- a/src/Text/Pandoc/Writers/Docx.hs
+++ b/src/Text/Pandoc/Writers/Docx.hs
@@ -36,7 +36,7 @@ import qualified Data.Text.Lazy as TL
 import Data.Time.Clock.POSIX
 import Data.Digest.Pure.SHA (sha1, showDigest)
 import Skylighting
-import UnicodeCollation.Lang (renderLang)
+import Text.Collate.Lang (renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import qualified Text.Pandoc.Class.PandocMonad as P
 import Data.Time
diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs
index e99bad738..8b1f3df1d 100644
--- a/src/Text/Pandoc/Writers/LaTeX.hs
+++ b/src/Text/Pandoc/Writers/LaTeX.hs
@@ -29,7 +29,7 @@ import qualified Data.Text as T
 import Network.URI (unEscapeString)
 import Text.DocTemplates (FromContext(lookupContext), renderTemplate,
                           Val(..), Context(..))
-import UnicodeCollation.Lang (Lang (..), renderLang)
+import Text.Collate.Lang (Lang (..), renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import Text.Pandoc.Definition
 import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight,
diff --git a/src/Text/Pandoc/Writers/LaTeX/Lang.hs b/src/Text/Pandoc/Writers/LaTeX/Lang.hs
index 437b84120..0ba68b74e 100644
--- a/src/Text/Pandoc/Writers/LaTeX/Lang.hs
+++ b/src/Text/Pandoc/Writers/LaTeX/Lang.hs
@@ -15,7 +15,7 @@ module Text.Pandoc.Writers.LaTeX.Lang
     toBabel
   ) where
 import Data.Text (Text)
-import UnicodeCollation.Lang (Lang(..))
+import Text.Collate.Lang (Lang(..))
 
 
 -- In environments \Arabic instead of \arabic is used
diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs
index 6fd4cdeb4..e4eb4fd25 100644
--- a/src/Text/Pandoc/Writers/ODT.hs
+++ b/src/Text/Pandoc/Writers/ODT.hs
@@ -24,7 +24,7 @@ import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import Data.Time
 import System.FilePath (takeDirectory, takeExtension, (<.>))
-import UnicodeCollation.Lang (Lang (..), renderLang)
+import Text.Collate.Lang (Lang (..), renderLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
 import qualified Text.Pandoc.Class.PandocMonad as P
 import Text.Pandoc.Definition
diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs
index 6c265090c..34a3a4aa5 100644
--- a/src/Text/Pandoc/Writers/OpenDocument.hs
+++ b/src/Text/Pandoc/Writers/OpenDocument.hs
@@ -25,7 +25,7 @@ import Data.Ord (comparing)
 import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
-import UnicodeCollation.Lang (Lang (..), parseLang)
+import Text.Collate.Lang (Lang (..), parseLang)
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report, translateTerm,
                                       setTranslations, toLang)
 import Text.Pandoc.Definition
diff --git a/stack.yaml b/stack.yaml
index 159bf74b9..7bc33fa43 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -16,11 +16,10 @@ extra-deps:
 - texmath-0.12.2
 - random-1.2.0
 - xml-conduit-1.9.1.1
+- unicode-collation-0.1
 # - citeproc-0.3.0.9
 - git: https://github.com/jgm/citeproc
-  commit: b42857be658b8f2649e989e061978e304986f853
-- git: https://github.com/jgm/unicode-collation
-  commit: 9d229a5c6bcbaf53d7022575234eb223cfa90d55
+  commit: f9439e07e9271c7c2674a51efcad2fb8c663b2c8
 ghc-options:
    "$locals": -fhide-source-paths -Wno-missing-home-modules
 resolver: lts-17.5
diff --git a/test/command/pandoc-citeproc-320a.md b/test/command/pandoc-citeproc-320a.md
index 1c3b47de0..79dacfa10 100644
--- a/test/command/pandoc-citeproc-320a.md
+++ b/test/command/pandoc-citeproc-320a.md
@@ -56,6 +56,10 @@ n.d.; al-'Udhrī, n.d.; Uch, n.d.; Uebel, n.d.; Zzz, n.d.).
 Uch, Ann. n.d.
 :::
 
+::: {#ref-item4 .csl-entry}
+'Udhrī, Jamīl al-. n.d.
+:::
+
 ::: {#ref-item1 .csl-entry}
 ʾUdhrī, Jamīl al-. n.d.
 :::
@@ -68,10 +72,6 @@ Uch, Ann. n.d.
 \'Udhrī, Jamīl al-. n.d.
 :::
 
-::: {#ref-item4 .csl-entry}
-'Udhrī, Jamīl al-. n.d.
-:::
-
 ::: {#ref-item5 .csl-entry}
 'Udhrī, Jamīl al-. n.d.
 :::
-- 
cgit v1.2.3


From ddbd984a0d8ea7e75f78ad6632fe3568e2390deb Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 17 May 2021 09:31:52 -0700
Subject: Text.Pandoc.MediaBag: change type to use a Text key...

instead of `[FilePath]`.
We normalize the path and use `/` separators for consistency.
---
 src/Text/Pandoc/Class/PandocMonad.hs |  1 +
 src/Text/Pandoc/MediaBag.hs          | 20 +++++++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 7559cd7cd..226194503 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE CPP #-}
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs
index 3249bcdeb..4a9b4efa1 100644
--- a/src/Text/Pandoc/MediaBag.hs
+++ b/src/Text/Pandoc/MediaBag.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE CPP                        #-}
+{-# LANGUAGE OverloadedStrings          #-}
 {-# LANGUAGE DeriveDataTypeable         #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {- |
@@ -27,26 +28,31 @@ import qualified Data.Map as M
 import Data.Maybe (fromMaybe)
 import Data.Typeable (Typeable)
 import System.FilePath
-import qualified System.FilePath.Posix as Posix
 import Text.Pandoc.MIME (MimeType, getMimeTypeDef)
+import Data.Text (Text)
+import qualified Data.Text as T
 
 -- | A container for a collection of binary resources, with names and
 -- mime types.  Note that a 'MediaBag' is a Monoid, so 'mempty'
 -- can be used for an empty 'MediaBag', and '<>' can be used to append
 -- two 'MediaBag's.
-newtype MediaBag = MediaBag (M.Map [FilePath] (MimeType, BL.ByteString))
+newtype MediaBag = MediaBag (M.Map Text (MimeType, BL.ByteString))
         deriving (Semigroup, Monoid, Data, Typeable)
 
 instance Show MediaBag where
   show bag = "MediaBag " ++ show (mediaDirectory bag)
 
+-- | We represent paths with /, in normalized form.
+canonicalize :: FilePath -> Text
+canonicalize = T.replace "\\" "/" . T.pack . normalise
+
 -- | Delete a media item from a 'MediaBag', or do nothing if no item corresponds
 -- to the given path.
 deleteMedia :: FilePath       -- ^ relative path and canonical name of resource
             -> MediaBag
             -> MediaBag
 deleteMedia fp (MediaBag mediamap) =
-  MediaBag $ M.delete (splitDirectories fp) mediamap
+  MediaBag $ M.delete (canonicalize fp) mediamap
 
 -- | Insert a media item into a 'MediaBag', replacing any existing
 -- value with the same name.
@@ -56,7 +62,7 @@ insertMedia :: FilePath       -- ^ relative path and canonical name of resource
             -> MediaBag
             -> MediaBag
 insertMedia fp mbMime contents (MediaBag mediamap) =
-  MediaBag (M.insert (splitDirectories fp) (mime, contents) mediamap)
+  MediaBag (M.insert (canonicalize fp) (mime, contents) mediamap)
   where mime = fromMaybe fallback mbMime
         fallback = case takeExtension fp of
                         ".gz" -> getMimeTypeDef $ dropExtension fp
@@ -66,16 +72,16 @@ insertMedia fp mbMime contents (MediaBag mediamap) =
 lookupMedia :: FilePath
             -> MediaBag
             -> Maybe (MimeType, BL.ByteString)
-lookupMedia fp (MediaBag mediamap) = M.lookup (splitDirectories fp) mediamap
+lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
 
 -- | Get a list of the file paths stored in a 'MediaBag', with
 -- their corresponding mime types and the lengths in bytes of the contents.
 mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
 mediaDirectory (MediaBag mediamap) =
   M.foldrWithKey (\fp (mime,contents) ->
-      ((Posix.joinPath fp, mime, fromIntegral $ BL.length contents):)) [] mediamap
+      ((T.unpack fp, mime, fromIntegral (BL.length contents)):)) [] mediamap
 
 mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
 mediaItems (MediaBag mediamap) =
   M.foldrWithKey (\fp (mime,contents) ->
-      ((Posix.joinPath fp, mime, contents):)) [] mediamap
+      ((T.unpack fp, mime, contents):)) [] mediamap
-- 
cgit v1.2.3


From 9b5798bd9abe267f783dab7fe5295c9b61e2fdce Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 17 May 2021 21:22:46 -0700
Subject: Use fetchItem instead of downloadOrRead in fetchMediaResource.

---
 src/Text/Pandoc/Class/PandocMonad.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 226194503..b12850de5 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -635,7 +635,7 @@ withPaths (p:ps) action fp =
 fetchMediaResource :: PandocMonad m
               => T.Text -> m (FilePath, Maybe MimeType, BL.ByteString)
 fetchMediaResource src = do
-  (bs, mt) <- downloadOrRead src
+  (bs, mt) <- fetchItem src
   let ext = fromMaybe (T.pack $ takeExtension $ T.unpack src)
                       (mt >>= extensionFromMimeType)
   let bs' = BL.fromChunks [bs]
-- 
cgit v1.2.3


From 640dbf8b8f5e652661df42c631b4343570d7448e Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 19 May 2021 09:51:50 -0700
Subject: Remove unused pragma.

---
 src/Text/Pandoc/Class/PandocMonad.hs | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index b12850de5..dd6499a73 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -1,5 +1,4 @@
 {-# LANGUAGE CPP #-}
-{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
-- 
cgit v1.2.3


From 8511f6fdf6c9fbc2cc926538bca4ae9f554b4ed9 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 23 May 2021 22:57:02 -0700
Subject: MediaBag improvements.

In the current dev version, we will sometimes add
a version of an image with a hashed name, keeping
the original version with the original name, which
would leave to undesirable duplication.

This change separates the media's filename from the
media's canonical name (which is the path of the link
in the document itself).  Filenames are based on SHA1
hashes and assigned automatically.

In Text.Pandoc.MediaBag:

- Export MediaItem type [API change].
- Change MediaBag type to a map from Text to MediaItem [API change].
- `lookupMedia` now returns a `MediaItem` [API change].
- Change `insertMedia` so it sets the `mediaPath` to
  a filename based on the SHA1 hash of the contents.
  This will be used when contents are extracted.

In Text.Pandoc.Class.PandocMonad:

- Remove `fetchMediaResource` [API change].

Lua MediaBag module has been changed minimally. In the future
it would be better, probably, to give Lua access to the full
MediaItem type.
---
 src/Text/Pandoc/Class/IO.hs            |  9 ++++---
 src/Text/Pandoc/Class/PandocMonad.hs   | 43 ++++++++++++++--------------------
 src/Text/Pandoc/Lua/Module/MediaBag.hs |  6 ++---
 src/Text/Pandoc/MediaBag.hs            | 35 ++++++++++++++++++++-------
 test/Tests/Readers/Docx.hs             | 10 ++++----
 5 files changed, 55 insertions(+), 48 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index bb4e2b732..f12c0a938 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image))
 import Text.Pandoc.Error (PandocError (..))
 import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage)
 import Text.Pandoc.MIME (MimeType)
-import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory)
+import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaDirectory)
 import Text.Pandoc.Walk (walk)
 import qualified Control.Exception as E
 import qualified Data.ByteString as B
@@ -213,14 +213,13 @@ writeMedia :: (PandocMonad m, MonadIO m)
 writeMedia dir mediabag subpath = do
   -- we join and split to convert a/b/c to a\b\c on Windows;
   -- in zip containers all paths use /
-  let fullpath = dir </> unEscapeString (normalise subpath)
   let mbcontents = lookupMedia subpath mediabag
   case mbcontents of
        Nothing -> throwError $ PandocResourceNotFound $ pack subpath
-       Just (_, bs) -> do
-         report $ Extracting $ pack fullpath
+       Just item -> do
+         let fullpath = dir </> mediaPath item
          liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
-         logIOError $ BL.writeFile fullpath bs
+         logIOError $ BL.writeFile fullpath $ mediaContents item
 
 -- | If the given Inline element is an image with a @src@ path equal to
 -- one in the list of @paths@, then prepends @dir@ to the image source;
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index dd6499a73..ae6917e06 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -37,7 +37,6 @@ module Text.Pandoc.Class.PandocMonad
   , setUserDataDir
   , getUserDataDir
   , fetchItem
-  , fetchMediaResource
   , getInputFiles
   , setInputFiles
   , getOutputFile
@@ -57,8 +56,6 @@ module Text.Pandoc.Class.PandocMonad
 import Codec.Archive.Zip
 import Control.Monad.Except (MonadError (catchError, throwError),
                              MonadTrans, lift, when)
-import Data.Digest.Pure.SHA (sha1, showDigest)
-import Data.Maybe (fromMaybe)
 import Data.List (foldl')
 import Data.Time (UTCTime)
 import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds,
@@ -67,7 +64,7 @@ import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime)
 import Network.URI ( escapeURIString, nonStrictRelativeTo,
                      unEscapeString, parseURIReference, isAllowedInURI,
                      parseURI, URI(..) )
-import System.FilePath ((</>), (<.>), takeExtension, dropExtension,
+import System.FilePath ((</>), takeExtension, dropExtension,
                         isRelative, splitDirectories)
 import System.Random (StdGen)
 import Text.Collate.Lang (Lang(..), parseLang, renderLang)
@@ -75,8 +72,8 @@ import Text.Pandoc.Class.CommonState (CommonState (..))
 import Text.Pandoc.Definition
 import Text.Pandoc.Error
 import Text.Pandoc.Logging
-import Text.Pandoc.MIME (MimeType, getMimeType, extensionFromMimeType)
-import Text.Pandoc.MediaBag (MediaBag, lookupMedia)
+import Text.Pandoc.MIME (MimeType, getMimeType)
+import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..))
 import Text.Pandoc.Shared (uriPathToPath, safeRead)
 import Text.Pandoc.Translations (Term(..), Translations, lookupTerm,
                                  readTranslations)
@@ -376,7 +373,8 @@ fetchItem :: PandocMonad m
 fetchItem s = do
   mediabag <- getMediaBag
   case lookupMedia (T.unpack s) mediabag of
-    Just (mime, bs) -> return (BL.toStrict bs, Just mime)
+    Just item -> return (BL.toStrict (mediaContents item),
+                         Just (mediaMimeType item))
     Nothing -> downloadOrRead s
 
 -- | Returns the content and, if available, the MIME type of a resource.
@@ -629,19 +627,6 @@ withPaths (p:ps) action fp =
   catchError (action (p </> fp))
              (\_ -> withPaths ps action fp)
 
--- | Fetch local or remote resource (like an image) and provide data suitable
--- for adding it to the MediaBag.
-fetchMediaResource :: PandocMonad m
-              => T.Text -> m (FilePath, Maybe MimeType, BL.ByteString)
-fetchMediaResource src = do
-  (bs, mt) <- fetchItem src
-  let ext = fromMaybe (T.pack $ takeExtension $ T.unpack src)
-                      (mt >>= extensionFromMimeType)
-  let bs' = BL.fromChunks [bs]
-  let basename = showDigest $ sha1 bs'
-  let fname = basename <.> T.unpack ext
-  return (fname, mt, bs')
-
 -- | Traverse tree, filling media bag for any images that
 -- aren't already in the media bag.
 fillMediaBag :: PandocMonad m => Pandoc -> m Pandoc
@@ -649,12 +634,18 @@ fillMediaBag d = walkM handleImage d
   where handleImage :: PandocMonad m => Inline -> m Inline
         handleImage (Image attr lab (src, tit)) = catchError
           (do mediabag <- getMediaBag
-              case lookupMedia (T.unpack src) mediabag of
-                Just (_, _) -> return $ Image attr lab (src, tit)
-                Nothing -> do
-                  (fname, mt, bs) <- fetchMediaResource src
-                  insertMedia fname mt bs
-                  return $ Image attr lab (T.pack fname, tit))
+              let fp = T.unpack src
+              src' <- T.pack <$> case lookupMedia fp mediabag of
+                        Just item -> return $ mediaPath item
+                        Nothing -> do
+                          (bs, mt) <- fetchItem src
+                          insertMedia fp mt (BL.fromStrict bs)
+                          mediabag' <- getMediaBag
+                          case lookupMedia fp mediabag' of
+                             Just item -> return $ mediaPath item
+                             Nothing -> throwError $ PandocSomeError $
+                               src <> " not successfully inserted into MediaBag"
+              return $ Image attr lab (src', tit))
           (\e ->
               case e of
                 PandocResourceNotFound _ -> do
diff --git a/src/Text/Pandoc/Lua/Module/MediaBag.hs b/src/Text/Pandoc/Lua/Module/MediaBag.hs
index 78b699176..3eed50fca 100644
--- a/src/Text/Pandoc/Lua/Module/MediaBag.hs
+++ b/src/Text/Pandoc/Lua/Module/MediaBag.hs
@@ -73,9 +73,9 @@ lookup fp = do
   res <- MB.lookupMedia fp <$> getMediaBag
   liftPandocLua $ case res of
     Nothing -> 1 <$ Lua.pushnil
-    Just (mimeType, contents) -> do
-      Lua.push mimeType
-      Lua.push contents
+    Just item -> do
+      Lua.push $ MB.mediaMimeType item
+      Lua.push $ MB.mediaContents item
       return 2
 
 list :: PandocLua NumResults
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs
index 4a9b4efa1..a65f315fc 100644
--- a/src/Text/Pandoc/MediaBag.hs
+++ b/src/Text/Pandoc/MediaBag.hs
@@ -15,6 +15,7 @@ Definition of a MediaBag object to hold binary resources, and an
 interface for interacting with it.
 -}
 module Text.Pandoc.MediaBag (
+                     MediaItem(..),
                      MediaBag,
                      deleteMedia,
                      lookupMedia,
@@ -28,15 +29,23 @@ import qualified Data.Map as M
 import Data.Maybe (fromMaybe)
 import Data.Typeable (Typeable)
 import System.FilePath
-import Text.Pandoc.MIME (MimeType, getMimeTypeDef)
+import Text.Pandoc.MIME (MimeType, getMimeTypeDef, extensionFromMimeType)
 import Data.Text (Text)
 import qualified Data.Text as T
+import Data.Digest.Pure.SHA (sha1, showDigest)
+
+data MediaItem =
+  MediaItem
+  { mediaMimeType :: MimeType
+  , mediaPath :: FilePath
+  , mediaContents :: BL.ByteString
+  } deriving (Eq, Ord, Show, Data, Typeable)
 
 -- | A container for a collection of binary resources, with names and
 -- mime types.  Note that a 'MediaBag' is a Monoid, so 'mempty'
 -- can be used for an empty 'MediaBag', and '<>' can be used to append
 -- two 'MediaBag's.
-newtype MediaBag = MediaBag (M.Map Text (MimeType, BL.ByteString))
+newtype MediaBag = MediaBag (M.Map Text MediaItem)
         deriving (Semigroup, Monoid, Data, Typeable)
 
 instance Show MediaBag where
@@ -62,26 +71,34 @@ insertMedia :: FilePath       -- ^ relative path and canonical name of resource
             -> MediaBag
             -> MediaBag
 insertMedia fp mbMime contents (MediaBag mediamap) =
-  MediaBag (M.insert (canonicalize fp) (mime, contents) mediamap)
-  where mime = fromMaybe fallback mbMime
+  MediaBag (M.insert (canonicalize fp) mediaItem mediamap)
+  where mediaItem = MediaItem{ mediaPath = showDigest (sha1 contents) <>
+                                 "." <> ext
+                             , mediaContents = contents
+                             , mediaMimeType = mt }
         fallback = case takeExtension fp of
                         ".gz" -> getMimeTypeDef $ dropExtension fp
                         _     -> getMimeTypeDef fp
+        mt = fromMaybe fallback mbMime
+        ext = maybe (takeExtension fp) T.unpack $ extensionFromMimeType mt
+
 
 -- | Lookup a media item in a 'MediaBag', returning mime type and contents.
 lookupMedia :: FilePath
             -> MediaBag
-            -> Maybe (MimeType, BL.ByteString)
+            -> Maybe MediaItem
 lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
 
 -- | Get a list of the file paths stored in a 'MediaBag', with
 -- their corresponding mime types and the lengths in bytes of the contents.
 mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
 mediaDirectory (MediaBag mediamap) =
-  M.foldrWithKey (\fp (mime,contents) ->
-      ((T.unpack fp, mime, fromIntegral (BL.length contents)):)) [] mediamap
+  M.foldrWithKey (\fp item ->
+      ((T.unpack fp, mediaMimeType item,
+        fromIntegral (BL.length (mediaContents item))):)) [] mediamap
 
 mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
 mediaItems (MediaBag mediamap) =
-  M.foldrWithKey (\fp (mime,contents) ->
-      ((T.unpack fp, mime, contents):)) [] mediamap
+  M.foldrWithKey (\fp item ->
+      ((T.unpack fp, mediaMimeType item, mediaContents item):))
+     [] mediamap
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index 2cce70cc5..939ff9939 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -24,7 +24,7 @@ import Test.Tasty.HUnit
 import Tests.Helpers
 import Text.Pandoc
 import qualified Text.Pandoc.Class as P
-import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory)
+import qualified Text.Pandoc.MediaBag as MB
 import Text.Pandoc.UTF8 as UTF8
 
 -- We define a wrapper around pandoc that doesn't normalize in the
@@ -91,11 +91,11 @@ getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString)
 getMedia archivePath mediaPath = fmap fromEntry . findEntryByPath
     ("word/" ++ mediaPath) . toArchive <$> B.readFile archivePath
 
-compareMediaPathIO :: FilePath -> MediaBag -> FilePath -> IO Bool
+compareMediaPathIO :: FilePath -> MB.MediaBag -> FilePath -> IO Bool
 compareMediaPathIO mediaPath mediaBag docxPath = do
   docxMedia <- getMedia docxPath mediaPath
-  let mbBS   = case lookupMedia mediaPath mediaBag of
-                 Just (_, bs) -> bs
+  let mbBS   = case MB.lookupMedia mediaPath mediaBag of
+                 Just item    -> MB.mediaContents item
                  Nothing      -> error ("couldn't find " ++
                                         mediaPath ++
                                         " in media bag")
@@ -110,7 +110,7 @@ compareMediaBagIO docxFile = do
     mb <- runIOorExplode $ readDocx defopts df >> P.getMediaBag
     bools <- mapM
              (\(fp, _, _) -> compareMediaPathIO fp mb docxFile)
-             (mediaDirectory mb)
+             (MB.mediaDirectory mb)
     return $ and bools
 
 testMediaBagIO :: String -> FilePath -> IO TestTree
-- 
cgit v1.2.3


From f2c1b5746912db945be780961b6503e38c3c7e1e Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Tue, 25 May 2021 10:08:30 -0700
Subject: PandocMonad: add info message in `downloadOrRead`...

indicating what path local resources have been loaded from.
---
 src/Text/Pandoc/Class/PandocMonad.hs | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index ae6917e06..b5f401619 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE CPP #-}
+{-# LANGUAGE TupleSections #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@@ -409,9 +410,10 @@ downloadOrRead s = do
             _ -> readLocalFile fp -- get from local file system
    where readLocalFile f = do
              resourcePath <- getResourcePath
-             cont <- if isRelative f
-                        then withPaths resourcePath readFileStrict f
-                        else readFileStrict f
+             (fp', cont) <- if isRelative f
+                               then withPaths resourcePath readFileStrict f
+                               else (f,) <$> readFileStrict f
+             report $ LoadedResource f fp'
              return (cont, mime)
          httpcolon = URI{ uriScheme = "http:",
                           uriAuthority = Nothing,
@@ -621,10 +623,11 @@ makeCanonical = Posix.joinPath . transformPathParts . splitDirectories
 -- that filepath. Returns the result of the first successful execution
 -- of the action, or throws a @PandocResourceNotFound@ exception if the
 -- action errors for all filepaths.
-withPaths :: PandocMonad m => [FilePath] -> (FilePath -> m a) -> FilePath -> m a
+withPaths :: PandocMonad m
+          => [FilePath] -> (FilePath -> m a) -> FilePath -> m (FilePath, a)
 withPaths [] _ fp = throwError $ PandocResourceNotFound $ T.pack fp
 withPaths (p:ps) action fp =
-  catchError (action (p </> fp))
+  catchError ((p </> fp,) <$> action (p </> fp))
              (\_ -> withPaths ps action fp)
 
 -- | Traverse tree, filling media bag for any images that
-- 
cgit v1.2.3


From cc206af392a40dd7b01b714ae7f33b2fbf4925cc Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 30 May 2021 10:22:02 -0700
Subject: Have LoadedResource use relative paths.

The immediate reason for this is to allow the test output of #3752
to work on both windows and linux.
---
 src/Text/Pandoc/Class/PandocMonad.hs | 4 ++--
 test/command/3752.md                 | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index b5f401619..4eb80df29 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -66,7 +66,7 @@ import Network.URI ( escapeURIString, nonStrictRelativeTo,
                      unEscapeString, parseURIReference, isAllowedInURI,
                      parseURI, URI(..) )
 import System.FilePath ((</>), takeExtension, dropExtension,
-                        isRelative, splitDirectories)
+                        isRelative, splitDirectories, makeRelative)
 import System.Random (StdGen)
 import Text.Collate.Lang (Lang(..), parseLang, renderLang)
 import Text.Pandoc.Class.CommonState (CommonState (..))
@@ -413,7 +413,7 @@ downloadOrRead s = do
              (fp', cont) <- if isRelative f
                                then withPaths resourcePath readFileStrict f
                                else (f,) <$> readFileStrict f
-             report $ LoadedResource f fp'
+             report $ LoadedResource f (makeRelative "." fp')
              return (cont, mime)
          httpcolon = URI{ uriScheme = "http:",
                           uriAuthority = Nothing,
diff --git a/test/command/3752.md b/test/command/3752.md
index 6ac025ebe..863e3f2d4 100644
--- a/test/command/3752.md
+++ b/test/command/3752.md
@@ -1,9 +1,9 @@
 ```
 % pandoc command/chap1/text.md command/chap2/text.md -f markdown+rebase_relative_paths --verbose -t docx -o - | pandoc -f docx -t plain
 ^D
-[INFO] Loaded command/chap1/spider.png from ./command/chap1/spider.png
-[INFO] Loaded command/chap2/spider.png from ./command/chap2/spider.png
-[INFO] Loaded command/chap1/../../lalune.jpg from ./command/chap1/../../lalune.jpg
+[INFO] Loaded command/chap1/spider.png from command/chap1/spider.png
+[INFO] Loaded command/chap2/spider.png from command/chap2/spider.png
+[INFO] Loaded command/chap1/../../lalune.jpg from command/chap1/../../lalune.jpg
 Chapter one
 
 A spider: [spider]
-- 
cgit v1.2.3


From b6c04383e403b0962db09e6748760d3ec376f2ed Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 3 Jun 2021 18:34:38 -0600
Subject: T.P.Class.IO: normalise path in writeMedia.

This ensures that we get `\` separators on Windows.
---
 src/Text/Pandoc/Class/IO.hs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index f12c0a938..6df39d4d0 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -211,13 +211,12 @@ writeMedia :: (PandocMonad m, MonadIO m)
            => FilePath -> MediaBag -> FilePath
            -> m ()
 writeMedia dir mediabag subpath = do
-  -- we join and split to convert a/b/c to a\b\c on Windows;
-  -- in zip containers all paths use /
   let mbcontents = lookupMedia subpath mediabag
   case mbcontents of
        Nothing -> throwError $ PandocResourceNotFound $ pack subpath
        Just item -> do
-         let fullpath = dir </> mediaPath item
+         -- we normalize to get proper path separators for the platform
+         let fullpath = dir </> normalise (mediaPath item)
          liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
          logIOError $ BL.writeFile fullpath $ mediaContents item
 
-- 
cgit v1.2.3


From 3776e828a83048697e5c64d9fb4bedc0145197dc Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 10 Jun 2021 16:47:02 -0700
Subject: Fix MediaBag regressions.

With the 2.14 release `--extract-media` stopped working as before;
there could be mismatches between the paths in the rendered document and
the extracted media.

This patch makes several changes (while keeping the same API).

The `mediaPath` in 2.14 was always constructed from the SHA1 hash of
the media contents.  Now, we preserve the original path unless it's
an absolute path or contains `..` segments (in that case we use a path
based on the SHA1 hash of the contents).

When constructing a path from the SHA1 hash, we always use the
original extension, if there is one. Otherwise we look up an
appropriate extension for the mime type.

`mediaDirectory` and `mediaItems` now use the `mediaPath`, rather
than the mediabag key, for the first component of the tuple.
This makes more sense, I think, and fits with the documentation
of these functions; eventually, though, we should rework the API so that
`mediaItems` returns both the keys and the MediaItems.

Rewriting of source paths in `extractMedia` has been fixed.

`fillMediaBag` has been modified so that it doesn't modify
image paths (that was part of the problem in #7345).

We now do path normalization (e.g. `\` separators on Windows) only
in writing the media; the paths are left unchanged in the image
links (sensibly, since they might be URLs and not file paths).

These changes should restore the original behavior from before 2.14.

Closes #7345.
---
 MANUAL.txt                           | 12 +++++------
 src/Text/Pandoc/Class/IO.hs          | 41 ++++++++++++++++++------------------
 src/Text/Pandoc/Class/PandocMonad.hs | 17 ++++++---------
 src/Text/Pandoc/MediaBag.hs          | 25 ++++++++++++----------
 4 files changed, 47 insertions(+), 48 deletions(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/MANUAL.txt b/MANUAL.txt
index b3a1f95e2..ef569433a 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -675,12 +675,12 @@ header when requesting a document from a URL:
 :   Extract images and other media contained in or linked from
     the source document to the path *DIR*, creating it if
     necessary, and adjust the images references in the document
-    so they point to the extracted files.  If the source format is
-    a binary container (docx, epub, or odt), the media is
-    extracted from the container and the original
-    filenames are used. Otherwise the media is read from the
-    file system or downloaded, and new filenames are constructed
-    based on SHA1 hashes of the contents.
+    so they point to the extracted files.  Media are downloaded,
+    read from the file system, or extracted from a binary
+    container (e.g. docx), as needed.  The original file paths
+    are used if they are relative paths not containing `..`.
+    Otherwise filenames are constructed from the SHA1 hash of
+    the contents.
 
 `--abbreviations=`*FILE*
 
diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index 6df39d4d0..169074860 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image))
 import Text.Pandoc.Error (PandocError (..))
 import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage)
 import Text.Pandoc.MIME (MimeType)
-import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaDirectory)
+import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaItems)
 import Text.Pandoc.Walk (walk)
 import qualified Control.Exception as E
 import qualified Data.ByteString as B
@@ -200,31 +200,32 @@ alertIndent (l:ls) = do
 extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc
 extractMedia dir d = do
   media <- getMediaBag
-  case [fp | (fp, _, _) <- mediaDirectory media] of
-    []  -> return d
-    fps -> do
-      mapM_ (writeMedia dir media) fps
-      return $ walk (adjustImagePath dir fps) d
+  let items = mediaItems media
+  if null items
+    then return d
+    else do
+      mapM_ (writeMedia dir) items
+      return $ walk (adjustImagePath dir media) d
 
 -- | Write the contents of a media bag to a path.
 writeMedia :: (PandocMonad m, MonadIO m)
-           => FilePath -> MediaBag -> FilePath
+           => FilePath
+           -> (FilePath, MimeType, BL.ByteString)
            -> m ()
-writeMedia dir mediabag subpath = do
-  let mbcontents = lookupMedia subpath mediabag
-  case mbcontents of
-       Nothing -> throwError $ PandocResourceNotFound $ pack subpath
-       Just item -> do
-         -- we normalize to get proper path separators for the platform
-         let fullpath = dir </> normalise (mediaPath item)
-         liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
-         logIOError $ BL.writeFile fullpath $ mediaContents item
+writeMedia dir (fp, _mt, bs) = do
+  -- we normalize to get proper path separators for the platform
+  let fullpath = normalise $ dir </> fp
+  liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
+  logIOError $ BL.writeFile fullpath bs
 
 -- | If the given Inline element is an image with a @src@ path equal to
 -- one in the list of @paths@, then prepends @dir@ to the image source;
 -- returns the element unchanged otherwise.
-adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline
-adjustImagePath dir paths (Image attr lab (src, tit))
-   | unpack src `elem` paths
-     = Image attr lab (pack (normalise $ dir </> unpack src), tit)
+adjustImagePath :: FilePath -> MediaBag -> Inline -> Inline
+adjustImagePath dir mediabag (Image attr lab (src, tit)) =
+  case lookupMedia (T.unpack src) mediabag of
+    Nothing -> Image attr lab (src, tit)
+    Just item ->
+      let fullpath = dir </> mediaPath item
+      in  Image attr lab (T.pack fullpath, tit)
 adjustImagePath _ _ x = x
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 4eb80df29..439aec071 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -638,17 +638,12 @@ fillMediaBag d = walkM handleImage d
         handleImage (Image attr lab (src, tit)) = catchError
           (do mediabag <- getMediaBag
               let fp = T.unpack src
-              src' <- T.pack <$> case lookupMedia fp mediabag of
-                        Just item -> return $ mediaPath item
-                        Nothing -> do
-                          (bs, mt) <- fetchItem src
-                          insertMedia fp mt (BL.fromStrict bs)
-                          mediabag' <- getMediaBag
-                          case lookupMedia fp mediabag' of
-                             Just item -> return $ mediaPath item
-                             Nothing -> throwError $ PandocSomeError $
-                               src <> " not successfully inserted into MediaBag"
-              return $ Image attr lab (src', tit))
+              case lookupMedia fp mediabag of
+                Just _ -> return ()
+                Nothing -> do
+                  (bs, mt) <- fetchItem src
+                  insertMedia fp mt (BL.fromStrict bs)
+              return $ Image attr lab (src, tit))
           (\e ->
               case e of
                 PandocResourceNotFound _ -> do
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs
index a65f315fc..06fba5632 100644
--- a/src/Text/Pandoc/MediaBag.hs
+++ b/src/Text/Pandoc/MediaBag.hs
@@ -71,16 +71,21 @@ insertMedia :: FilePath       -- ^ relative path and canonical name of resource
             -> MediaBag
             -> MediaBag
 insertMedia fp mbMime contents (MediaBag mediamap) =
-  MediaBag (M.insert (canonicalize fp) mediaItem mediamap)
-  where mediaItem = MediaItem{ mediaPath = showDigest (sha1 contents) <>
-                                 "." <> ext
+  MediaBag (M.insert fp' mediaItem mediamap)
+  where mediaItem = MediaItem{ mediaPath = newpath
                              , mediaContents = contents
                              , mediaMimeType = mt }
+        fp' = canonicalize fp
+        newpath = if isRelative fp && ".." `notElem` splitPath fp
+                     then T.unpack fp'
+                     else showDigest (sha1 contents) <> "." <> ext
         fallback = case takeExtension fp of
                         ".gz" -> getMimeTypeDef $ dropExtension fp
                         _     -> getMimeTypeDef fp
         mt = fromMaybe fallback mbMime
-        ext = maybe (takeExtension fp) T.unpack $ extensionFromMimeType mt
+        ext = case takeExtension fp of
+               '.':e  -> e
+               _      -> maybe "" T.unpack $ extensionFromMimeType mt
 
 
 -- | Lookup a media item in a 'MediaBag', returning mime type and contents.
@@ -92,13 +97,11 @@ lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
 -- | Get a list of the file paths stored in a 'MediaBag', with
 -- their corresponding mime types and the lengths in bytes of the contents.
 mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
-mediaDirectory (MediaBag mediamap) =
-  M.foldrWithKey (\fp item ->
-      ((T.unpack fp, mediaMimeType item,
-        fromIntegral (BL.length (mediaContents item))):)) [] mediamap
+mediaDirectory mediabag =
+  map (\(fp, mt, bs) -> (fp, mt, fromIntegral (BL.length bs)))
+    (mediaItems mediabag)
 
 mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
 mediaItems (MediaBag mediamap) =
-  M.foldrWithKey (\fp item ->
-      ((T.unpack fp, mediaMimeType item, mediaContents item):))
-     [] mediamap
+  map (\item -> (mediaPath item, mediaMimeType item, mediaContents item))
+      (M.elems mediamap)
-- 
cgit v1.2.3


From 477a67061f06827b7e807319404cc277a417e9d0 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Fri, 9 Jul 2021 14:14:19 -0700
Subject: Always use / when adding directory to image path with extractMedia.

Even on Windows.

May help with #7431.
---
 src/Text/Pandoc/Class/IO.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/Text/Pandoc/Class')

diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index 169074860..f4cfc8682 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -226,6 +226,6 @@ adjustImagePath dir mediabag (Image attr lab (src, tit)) =
   case lookupMedia (T.unpack src) mediabag of
     Nothing -> Image attr lab (src, tit)
     Just item ->
-      let fullpath = dir </> mediaPath item
+      let fullpath = dir <> "/" <> mediaPath item
       in  Image attr lab (T.pack fullpath, tit)
 adjustImagePath _ _ x = x
-- 
cgit v1.2.3