From 4f3434586743afb69f00ca91fe6ec9b68b39ae7e Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Fri, 8 Jan 2021 18:38:20 +0100 Subject: Update copyright notices for 2021 (#7012) --- src/Text/Pandoc/Readers/Docx/Combine.hs | 2 +- src/Text/Pandoc/Readers/Docx/Util.hs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index 46112af19..bcf26c4a3 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -2,7 +2,7 @@ {- | Module : Text.Pandoc.Readers.Docx.Combine Copyright : © 2014-2020 Jesse Rosenthal , - 2014-2020 John MacFarlane , + 2014-2021 John MacFarlane , 2020 Nikolay Yakimov License : GNU GPL, version 2 or above diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index a573344ff..f9c9a8e26 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,7 +1,7 @@ {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal , - 2014-2020 John MacFarlane , + 2014-2021 John MacFarlane , 2015 Nikolay Yakimov License : GNU GPL, version 2 or above -- cgit v1.2.3 From 8ca191604dcd13af27c11d2da225da646ebce6fc Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 8 Feb 2021 23:35:19 -0800 Subject: Add new unexported module T.P.XMLParser. This exports functions that uses xml-conduit's parser to produce an xml-light Element or [Content]. This allows existing pandoc code to use a better parser without much modification. The new parser is used in all places where xml-light's parser was previously used. Benchmarks show a significant performance improvement in parsing XML-based formats (especially ODT and FB2). Note that the xml-light types use String, so the conversion from xml-conduit types involves a lot of extra allocation. It would be desirable to avoid that in the future by gradually switching to using xml-conduit directly. This can be done module by module. The new parser also reports errors, which we report when possible. A new constructor PandocXMLError has been added to PandocError in T.P.Error [API change]. Closes #7091, which was the main stimulus. These changes revealed the need for some changes in the tests. The docbook-reader.docbook test lacked definitions for the entities it used; these have been added. And the docx golden tests have been updated, because the new parser does not preserve the order of attributes. Add entity defs to docbook-reader.docbook. Update golden tests for docx. --- MANUAL.txt | 1 + pandoc.cabal | 2 + src/Text/Pandoc/Error.hs | 3 + src/Text/Pandoc/ImageSize.hs | 5 +- src/Text/Pandoc/Readers/DocBook.hs | 52 ++++++++++------ src/Text/Pandoc/Readers/Docx/Parse.hs | 21 ++++--- src/Text/Pandoc/Readers/Docx/Parse/Styles.hs | 28 +++++---- src/Text/Pandoc/Readers/EPUB.hs | 17 ++++-- src/Text/Pandoc/Readers/FB2.hs | 10 ++-- src/Text/Pandoc/Readers/JATS.hs | 9 ++- src/Text/Pandoc/Readers/OPML.hs | 10 +++- src/Text/Pandoc/Readers/Odt.hs | 24 ++++---- src/Text/Pandoc/Writers/EPUB.hs | 13 +++- src/Text/Pandoc/Writers/FB2.hs | 11 +++- src/Text/Pandoc/Writers/ODT.hs | 29 +++++---- src/Text/Pandoc/Writers/OOXML.hs | 9 +-- src/Text/Pandoc/Writers/Powerpoint/Output.hs | 4 +- src/Text/Pandoc/XMLParser.hs | 66 +++++++++++++++++++++ test/Tests/Readers/JATS.hs | 1 + test/command/5321.md | 4 +- test/docbook-reader.docbook | 7 ++- test/docx/golden/block_quotes.docx | Bin 10092 -> 10098 bytes test/docx/golden/codeblock.docx | Bin 9944 -> 9950 bytes test/docx/golden/comments.docx | Bin 10279 -> 10285 bytes test/docx/golden/custom_style_no_reference.docx | Bin 10042 -> 10048 bytes test/docx/golden/custom_style_preserve.docx | Bin 10666 -> 10673 bytes test/docx/golden/custom_style_reference.docx | Bin 12434 -> 12434 bytes test/docx/golden/definition_list.docx | Bin 9941 -> 9947 bytes .../golden/document-properties-short-desc.docx | Bin 9947 -> 9953 bytes test/docx/golden/document-properties.docx | Bin 10423 -> 10429 bytes test/docx/golden/headers.docx | Bin 10080 -> 10086 bytes test/docx/golden/image.docx | Bin 26758 -> 26764 bytes test/docx/golden/inline_code.docx | Bin 9880 -> 9886 bytes test/docx/golden/inline_formatting.docx | Bin 10060 -> 10066 bytes test/docx/golden/inline_images.docx | Bin 26816 -> 26822 bytes test/docx/golden/link_in_notes.docx | Bin 10101 -> 10107 bytes test/docx/golden/links.docx | Bin 10276 -> 10282 bytes test/docx/golden/lists.docx | Bin 10352 -> 10358 bytes test/docx/golden/lists_continuing.docx | Bin 10143 -> 10149 bytes test/docx/golden/lists_multiple_initial.docx | Bin 10232 -> 10238 bytes test/docx/golden/lists_restarting.docx | Bin 10144 -> 10150 bytes test/docx/golden/nested_anchors_in_header.docx | Bin 10239 -> 10245 bytes test/docx/golden/notes.docx | Bin 10046 -> 10052 bytes test/docx/golden/raw-blocks.docx | Bin 9980 -> 9986 bytes test/docx/golden/raw-bookmarks.docx | Bin 10115 -> 10121 bytes test/docx/golden/table_one_row.docx | Bin 9932 -> 9938 bytes test/docx/golden/table_with_list_cell.docx | Bin 10249 -> 10255 bytes test/docx/golden/tables.docx | Bin 10266 -> 10272 bytes test/docx/golden/track_changes_deletion.docx | Bin 9924 -> 9930 bytes test/docx/golden/track_changes_insertion.docx | Bin 9907 -> 9913 bytes test/docx/golden/track_changes_move.docx | Bin 9941 -> 9947 bytes .../golden/track_changes_scrubbed_metadata.docx | Bin 10053 -> 10059 bytes test/docx/golden/unicode.docx | Bin 9865 -> 9871 bytes test/docx/golden/verbatim_subsuper.docx | Bin 9913 -> 9919 bytes test/jats-reader.native | 2 +- test/jats-reader.xml | 1 + test/pptx/code-custom.pptx | Bin 28230 -> 28221 bytes test/pptx/code-custom_templated.pptx | Bin 395524 -> 395516 bytes test/pptx/code.pptx | Bin 28229 -> 28220 bytes test/pptx/code_templated.pptx | Bin 395522 -> 395514 bytes test/pptx/document-properties-short-desc.pptx | Bin 27012 -> 27004 bytes .../document-properties-short-desc_templated.pptx | Bin 394298 -> 394288 bytes test/pptx/document-properties.pptx | Bin 27417 -> 27408 bytes test/pptx/document-properties_templated.pptx | Bin 394701 -> 394691 bytes test/pptx/endnotes.pptx | Bin 26969 -> 26962 bytes test/pptx/endnotes_templated.pptx | Bin 394262 -> 394253 bytes test/pptx/endnotes_toc.pptx | Bin 27892 -> 27789 bytes test/pptx/endnotes_toc_templated.pptx | Bin 395186 -> 395083 bytes test/pptx/images.pptx | Bin 44626 -> 44619 bytes test/pptx/images_templated.pptx | Bin 411916 -> 411909 bytes test/pptx/inline_formatting.pptx | Bin 26156 -> 26148 bytes test/pptx/inline_formatting_templated.pptx | Bin 393447 -> 393438 bytes test/pptx/lists.pptx | Bin 27056 -> 27049 bytes test/pptx/lists_templated.pptx | Bin 394349 -> 394340 bytes test/pptx/raw_ooxml.pptx | Bin 26948 -> 26940 bytes test/pptx/raw_ooxml_templated.pptx | Bin 394240 -> 394231 bytes test/pptx/remove_empty_slides.pptx | Bin 44073 -> 44065 bytes test/pptx/remove_empty_slides_templated.pptx | Bin 411359 -> 411352 bytes test/pptx/slide_breaks.pptx | Bin 28582 -> 28575 bytes test/pptx/slide_breaks_slide_level_1.pptx | Bin 27751 -> 27744 bytes .../pptx/slide_breaks_slide_level_1_templated.pptx | Bin 395045 -> 395038 bytes test/pptx/slide_breaks_templated.pptx | Bin 395875 -> 395868 bytes test/pptx/slide_breaks_toc.pptx | Bin 29539 -> 29532 bytes test/pptx/slide_breaks_toc_templated.pptx | Bin 396833 -> 396826 bytes test/pptx/speaker_notes.pptx | Bin 35444 -> 35436 bytes test/pptx/speaker_notes_after_metadata.pptx | Bin 31683 -> 31675 bytes .../speaker_notes_after_metadata_templated.pptx | Bin 398964 -> 398955 bytes test/pptx/speaker_notes_afterheader.pptx | Bin 30700 -> 30691 bytes test/pptx/speaker_notes_afterheader_templated.pptx | Bin 397988 -> 397979 bytes test/pptx/speaker_notes_afterseps.pptx | Bin 51612 -> 51604 bytes test/pptx/speaker_notes_afterseps_templated.pptx | Bin 418903 -> 418896 bytes test/pptx/speaker_notes_templated.pptx | Bin 402736 -> 402728 bytes test/pptx/start_numbering_at.pptx | Bin 27031 -> 27023 bytes test/pptx/start_numbering_at_templated.pptx | Bin 394323 -> 394314 bytes test/pptx/tables.pptx | Bin 27573 -> 27566 bytes test/pptx/tables_templated.pptx | Bin 394868 -> 394859 bytes test/pptx/two_column.pptx | Bin 26075 -> 26065 bytes test/pptx/two_column_templated.pptx | Bin 393366 -> 393355 bytes 98 files changed, 238 insertions(+), 91 deletions(-) create mode 100644 src/Text/Pandoc/XMLParser.hs (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/MANUAL.txt b/MANUAL.txt index 5b90d039a..dc3b4ca77 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -1464,6 +1464,7 @@ Nonzero exit codes have the following meanings: 24 PandocCiteprocError 31 PandocEpubSubdirectoryError 43 PandocPDFError + 44 PandocXMLError 47 PandocPDFProgramNotFoundError 61 PandocHttpError 62 PandocShouldNeverHappenError diff --git a/pandoc.cabal b/pandoc.cabal index 72e7c2da5..e56456c68 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -493,6 +493,7 @@ library unicode-transforms >= 0.3 && < 0.4, unordered-containers >= 0.2 && < 0.3, xml >= 1.3.12 && < 1.4, + xml-conduit >= 1.7 && < 1.10, zip-archive >= 0.2.3.4 && < 0.5, zlib >= 0.5 && < 0.7 if os(windows) && arch(i386) @@ -686,6 +687,7 @@ library Text.Pandoc.Lua.PandocLua, Text.Pandoc.Lua.Util, Text.Pandoc.Lua.Walk, + Text.Pandoc.XMLParser, Text.Pandoc.CSS, Text.Pandoc.CSV, Text.Pandoc.RoffChar, diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs index 204cf15ca..831405f42 100644 --- a/src/Text/Pandoc/Error.hs +++ b/src/Text/Pandoc/Error.hs @@ -48,6 +48,7 @@ data PandocError = PandocIOError Text IOError | PandocFailOnWarningError | PandocPDFProgramNotFoundError Text | PandocPDFError Text + | PandocXMLError Text Text | PandocFilterError Text Text | PandocLuaError Text | PandocCouldNotFindDataFileError Text @@ -103,6 +104,8 @@ handleError (Left e) = PandocPDFProgramNotFoundError pdfprog -> err 47 $ pdfprog <> " not found. Please select a different --pdf-engine or install " <> pdfprog PandocPDFError logmsg -> err 43 $ "Error producing PDF.\n" <> logmsg + PandocXMLError fp logmsg -> err 44 $ "Invalid XML" <> + (if T.null fp then "" else " in " <> fp) <> ":\n" <> logmsg PandocFilterError filtername msg -> err 83 $ "Error running filter " <> filtername <> ":\n" <> msg PandocLuaError msg -> err 84 $ "Error running Lua:\n" <> msg diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index e19958f6a..e0a1af8e8 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -45,7 +45,9 @@ import Text.Pandoc.Definition import Text.Pandoc.Options import qualified Text.Pandoc.UTF8 as UTF8 import qualified Text.XML.Light as Xml +import Text.Pandoc.XMLParser (parseXMLElement) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE import Control.Applicative import qualified Data.Attoparsec.ByteString.Char8 as A @@ -327,7 +329,8 @@ getSize img = svgSize :: WriterOptions -> ByteString -> Maybe ImageSize svgSize opts img = do - doc <- Xml.parseXMLDoc $ UTF8.toString img + doc <- either (const mzero) return $ parseXMLElement + $ TL.fromStrict $ UTF8.toText img let viewboxSize = do vb <- Xml.findAttrBy (== Xml.QName "viewBox" Nothing Nothing) doc [_,_,w,h] <- mapM safeRead (T.words (T.pack vb)) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index ada3e98ec..ad0108843 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document. -} module Text.Pandoc.Readers.DocBook ( readDocBook ) where import Control.Monad.State.Strict -import Data.Char (isSpace, toUpper) +import Data.Char (isSpace, toUpper, isLetter) import Data.Default import Data.Either (rights) import Data.Foldable (asum) @@ -21,7 +21,10 @@ import Data.List (intersperse,elemIndex) import Data.Maybe (fromMaybe,mapMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Control.Monad.Except (throwError) import Text.HTML.TagSoup.Entity (lookupEntity) +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import Text.Pandoc.Options @@ -29,6 +32,7 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLContents) {- @@ -537,22 +541,25 @@ instance Default DBState where readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readDocBook _ inp = do - let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp + tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $ + parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp) (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree return $ Pandoc (dbMeta st') (toList . mconcat $ bs) --- We treat specially (issue #1236), converting it --- to
, since xml-light doesn't parse the instruction correctly. --- Other xml instructions are simply removed from the input stream. +-- We treat certain processing instructions by converting them to tags +-- beginning "pi-". handleInstructions :: Text -> Text -handleInstructions = T.pack . handleInstructions' . T.unpack - -handleInstructions' :: String -> String -handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs -handleInstructions' xs = case break (=='<') xs of - (ys, []) -> ys - ([], '<':zs) -> '<' : handleInstructions' zs - (ys, zs) -> ys ++ handleInstructions' zs +handleInstructions t = + let (x,y) = T.breakOn "" y + in (if T.takeWhile (\c -> isLetter c || c == '-') + (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"] + then x <> " T.drop 2 w <> "/>" + else x <> w <> T.take 2 z) <> + handleInstructions (T.drop 2 z) getFigure :: PandocMonad m => Element -> DB m Blocks getFigure e = do @@ -892,7 +899,11 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = T.pack $ qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then " qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty codeBlockWithLang = do @@ -964,7 +975,7 @@ parseBlock (Elem e) = cs -> map toAlignment cs let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9') || x == '.') s) - let textWidth = case filterChild (named "?dbfo") e of + let textWidth = case filterChild (named "pi-dbfo") e of Just d -> case attrValue "table-width" d of "" -> 1.0 w -> fromMaybe 100.0 (parseWidth w) / 100.0 @@ -1165,12 +1176,15 @@ parseInline (Elem e) = "title" -> return mempty "affiliation" -> skip -- Note: this isn't a real docbook tag; it's what we convert - -- to in handleInstructions, above. A kludge to - -- work around xml-light's inability to parse an instruction. - "br" -> return linebreak + -- to in handleInstructions, above. + "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - lift $ report $ IgnoredElement $ T.pack $ qName (elName e) + let qn = T.pack $ qName $ elName e + let name = if "pi-" `T.isPrefixOf` qn + then " qn <> "?>" + else qn + lift $ report $ IgnoredElement name return mempty innerInlines f = extractSpaces f . mconcat <$> diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index fdcffcc3f..056dab6c2 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -74,6 +74,7 @@ import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) import Text.XML.Light import qualified Text.XML.Light.Cursor as XMLC +import Text.Pandoc.XMLParser (parseXMLElement) data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -343,10 +344,16 @@ archiveToDocxWithWarnings archive = do Right doc -> Right (Docx doc, stateWarnings st) Left e -> Left e +parseXMLFromEntry :: Entry -> Maybe Element +parseXMLFromEntry entry = + case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of + Left _ -> Nothing + Right el -> Just el + getDocumentXmlPath :: Archive -> Maybe FilePath getDocumentXmlPath zf = do entry <- findEntryByPath "_rels/.rels" zf - relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + relsElem <- parseXMLFromEntry entry let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e == Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument") @@ -362,7 +369,7 @@ archiveToDocument :: Archive -> D Document archiveToDocument zf = do docPath <- asks envDocXmlPath entry <- maybeToD $ findEntryByPath docPath zf - docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) @@ -401,9 +408,9 @@ constructBogusParStyleData stName = ParStyle archiveToNotes :: Archive -> Notes archiveToNotes zf = let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + >>= parseXMLFromEntry enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + >>= parseXMLFromEntry fn_namespaces = maybe [] elemToNameSpaces fnElem en_namespaces = maybe [] elemToNameSpaces enElem ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces @@ -415,7 +422,7 @@ archiveToNotes zf = archiveToComments :: Archive -> Comments archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + >>= parseXMLFromEntry cmts_namespaces = maybe [] elemToNameSpaces cmtsElem cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) in @@ -445,7 +452,7 @@ filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship] filePathToRelationships ar docXmlPath fp | Just relType <- filePathToRelType fp docXmlPath , Just entry <- findEntryByPath fp ar - , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = + , Just relElems <- parseXMLFromEntry entry = mapMaybe (relElemToRelationship relType) $ elChildren relElems filePathToRelationships _ _ _ = [] @@ -527,7 +534,7 @@ archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of Nothing -> Just $ Numbering [] [] [] Just entry -> do - numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + numberingElem <- parseXMLFromEntry entry let namespaces = elemToNameSpaces numberingElem numElems = findChildrenByName namespaces "w" "num" numberingElem absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index 236167187..edade8654 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -53,6 +53,7 @@ import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLElement) newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -135,19 +136,22 @@ defaultRunStyle = RunStyle { isBold = Nothing , rParentStyle = Nothing } -archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => - (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' + :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) + => (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) archiveToStyles' conv1 conv2 zf = - let stylesElem = findEntryByPath "word/styles.xml" zf >>= - (parseXMLDoc . UTF8.toStringLazy . fromEntry) - in - case stylesElem of - Nothing -> (M.empty, M.empty) - Just styElem -> - let namespaces = elemToNameSpaces styElem - in - ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, - M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) + case findEntryByPath "word/styles.xml" zf of + Nothing -> (M.empty, M.empty) + Just entry -> + case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of + Left _ -> (M.empty, M.empty) + Right styElem -> + let namespaces = elemToNameSpaces styElem + in + ( M.fromList $ map (\r -> (conv1 r, r)) $ + buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ + buildBasedOnList namespaces styElem Nothing) isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 5e3326e6d..369c4f0c9 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -17,7 +17,7 @@ module Text.Pandoc.Readers.EPUB (readEPUB) where -import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry, +import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry, toArchiveOrFail) import Control.DeepSeq (NFData, deepseq) import Control.Monad (guard, liftM, liftM2, mplus) @@ -41,9 +41,10 @@ import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Options (ReaderOptions (..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) -import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy) import Text.Pandoc.Walk (query, walk) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLElement) type Items = M.Map String (FilePath, MimeType) @@ -181,7 +182,7 @@ renameMeta s = T.pack s getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do metaEntry <- findEntryByPathE ("META-INF" "container.xml") archive - docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + docElem <- parseXMLDocE metaEntry let namespaces = mapMaybe attrToNSPair (elAttribs docElem) ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) as <- fmap (map attrToPair . elAttribs) @@ -190,7 +191,7 @@ getManifest archive = do let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive - (rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + (rootdir,) <$> parseXMLDocE manifest -- Fixup @@ -284,8 +285,12 @@ findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise . unEscapeString -> path) a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a -parseXMLDocE :: PandocMonad m => String -> m Element -parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc +parseXMLDocE :: PandocMonad m => Entry -> m Element +parseXMLDocE entry = + either (throwError . PandocXMLError fp) return $ parseXMLElement doc + where + doc = UTF8.toTextLazy . fromEntry $ entry + fp = T.pack $ eRelativePath entry findElementE :: PandocMonad m => QName -> Element -> m Element findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index b0d2f092b..b804eab4f 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -32,6 +32,7 @@ import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Data.Default import Data.Maybe import Text.HTML.TagSoup.Entity (lookupEntity) @@ -42,6 +43,7 @@ import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLElement) type FB2 m = StateT FB2State m @@ -64,10 +66,10 @@ instance HasMeta FB2State where readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readFB2 _ inp = - case parseXMLDoc $ crFilter inp of - Nothing -> throwError $ PandocParseError "Not an XML document" - Just e -> do - (bs, st) <- runStateT (parseRootElement e) def + case parseXMLElement $ TL.fromStrict $ crFilter inp of + Left msg -> throwError $ PandocXMLError "" msg + Right el -> do + (bs, st) <- runStateT (parseRootElement el) def let authors = if null $ fb2Authors st then id else setMeta "author" (map text $ reverse $ fb2Authors st) diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index c638da519..dfd343b7a 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -14,6 +14,8 @@ Conversion of JATS XML to 'Pandoc' document. module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict +import Control.Monad.Except (throwError) +import Text.Pandoc.Error (PandocError(..)) import Data.Char (isDigit, isSpace, toUpper) import Data.Default import Data.Generics @@ -22,6 +24,7 @@ import qualified Data.Map as Map import Data.Maybe (maybeToList, fromMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) @@ -29,6 +32,7 @@ import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLContents) import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) @@ -51,8 +55,9 @@ instance Default JATSState where readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readJATS _ inp = do - let tree = normalizeTree . parseXML - $ T.unpack $ crFilter inp + tree <- either (throwError . PandocXMLError "") + (return . normalizeTree) $ + parseXMLContents (TL.fromStrict $ crFilter inp) (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree return $ Pandoc (jatsMeta st') (toList . mconcat $ bs) diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index 5b8996025..bdadc4dd9 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -19,14 +19,18 @@ import Data.Generics import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) import Text.Pandoc.Shared (crFilter, blocksToInlines') import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLContents) +import Control.Monad.Except (throwError) type OPML m = StateT OPMLState m @@ -49,8 +53,10 @@ instance Default OPMLState where readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readOPML opts inp = do (bs, st') <- runStateT - (mapM parseBlock $ normalizeTree $ - parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts } + (case parseXMLContents (TL.fromStrict (crFilter inp)) of + Left msg -> throwError $ PandocXMLError "" msg + Right ns -> mapM parseBlock $ normalizeTree ns) + def{ opmlOptions = opts } return $ setTitle (opmlDocTitle st') $ setAuthors (opmlDocAuthors st') $ diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 9943d3147..85308deb1 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -15,6 +15,7 @@ module Text.Pandoc.Readers.Odt ( readOdt ) where import Codec.Archive.Zip import qualified Text.XML.Light as XML +import Text.Pandoc.XMLParser (parseXMLElement) import qualified Data.ByteString.Lazy as B @@ -66,18 +67,18 @@ bytesToOdt bytes = case toArchiveOrFail bytes of -- archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag) -archiveToOdt archive = either (Left. PandocParseError) Right $ do - let onFailure msg Nothing = Left msg +archiveToOdt archive = do + let onFailure msg Nothing = Left $ PandocParseError msg onFailure _ (Just x) = Right x contentEntry <- onFailure "Could not find content.xml" (findEntryByPath "content.xml" archive) stylesEntry <- onFailure "Could not find styles.xml" (findEntryByPath "styles.xml" archive) - contentElem <- onFailure "Could not find content element" - (entryToXmlElem contentEntry) - stylesElem <- onFailure "Could not find styles element" - (entryToXmlElem stylesEntry) - styles <- either (\_ -> Left "Could not read styles") Right + contentElem <- entryToXmlElem contentEntry + stylesElem <- entryToXmlElem stylesEntry + styles <- either + (\_ -> Left $ PandocParseError "Could not read styles") + Right (chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem)) let filePathIsOdtMedia :: FilePath -> Bool filePathIsOdtMedia fp = @@ -85,10 +86,13 @@ archiveToOdt archive = either (Left. PandocParseError) Right $ do in (dir == "Pictures/") || (dir /= "./" && name == "content.xml") let media = filteredFilesFromArchive archive filePathIsOdtMedia let startState = readerState styles media - either (\_ -> Left "Could not convert opendocument") Right + either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right (runConverter' read_body startState contentElem) -- -entryToXmlElem :: Entry -> Maybe XML.Element -entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry +entryToXmlElem :: Entry -> Either PandocError XML.Element +entryToXmlElem entry = + case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of + Right x -> Right x + Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 1f16f6772..e99fa2567 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -55,8 +55,9 @@ import Text.Pandoc.Walk (query, walk, walkM) import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB) import Text.Printf (printf) import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..), - add_attrs, lookupAttr, node, onlyElems, parseXML, + add_attrs, lookupAttr, node, onlyElems, ppElement, showElement, strContent, unode, unqual) +import Text.Pandoc.XMLParser (parseXMLContents) import Text.Pandoc.XML (escapeStringForXML) import Text.DocTemplates (FromContext(lookupContext), Context(..), ToContext(toVal), Val(..)) @@ -160,7 +161,12 @@ mkEntry path content = do getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata getEPUBMetadata opts meta = do let md = metadataFromMeta opts meta - let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts + elts <- case writerEpubMetadata opts of + Nothing -> return [] + Just t -> case parseXMLContents (TL.fromStrict t) of + Left msg -> throwError $ + PandocXMLError "epub metadata" msg + Right ns -> return (onlyElems ns) let md' = foldr addMetadataFromXML md elts let addIdentifier m = if null (epubIdentifier m) @@ -836,7 +842,8 @@ pandocToEPUB version opts doc = do : case subs of [] -> [] (_:_) -> [unode "ol" ! [("class","toc")] $ subs] - where titElements = parseXML titRendered + where titElements = either (const []) id $ + parseXMLContents (TL.fromStrict titRendered) titRendered = case P.runPure (writeHtmlStringForEPUB version opts{ writerTemplate = Nothing diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 25b1f28d1..9334d6e9a 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -19,7 +19,7 @@ FictionBook is an XML-based e-book format. For more information see: module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad (zipWithM) -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify) import Data.ByteString.Base64 (encode) import Data.Char (isAscii, isControl, isSpace) @@ -27,16 +27,18 @@ import Data.Either (lefts, rights) import Data.List (intercalate) import Data.Text (Text, pack) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE import Network.HTTP (urlEncode) import Text.XML.Light import qualified Text.XML.Light as X import qualified Text.XML.Light.Cursor as XC -import qualified Text.XML.Light.Input as XI +import Text.Pandoc.XMLParser (parseXMLContents) import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Logging import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def) import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers, @@ -307,7 +309,10 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . map (el "p" . el "code" . T.unpack) . T.lines $ s blockToXml (RawBlock f str) = if f == Format "fb2" - then return $ XI.parseXML str + then + case parseXMLContents (TL.fromStrict str) of + Left msg -> throwError $ PandocXMLError "" msg + Right nds -> return nds else return [] blockToXml (Div _ bs) = cMapM blockToXml bs blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index 05dfad5eb..a32ff618c 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -13,7 +13,7 @@ Conversion of 'Pandoc' documents to ODT. -} module Text.Pandoc.Writers.ODT ( writeODT ) where import Codec.Archive.Zip -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict import qualified Data.ByteString.Lazy as B import Data.Generics (everywhere', mkT) @@ -27,6 +27,7 @@ import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.ImageSize import Text.Pandoc.Logging import Text.Pandoc.MIME (extensionFromMimeType, getMimeType) @@ -35,10 +36,11 @@ import Text.DocLayout import Text.Pandoc.Shared (stringify, pandocVersion, tshow) import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks, fixDisplayMath) -import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy) +import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy) import Text.Pandoc.Walk import Text.Pandoc.Writers.OpenDocument (writeOpenDocument) import Text.Pandoc.XML +import Text.Pandoc.XMLParser (parseXMLElement) import Text.TeXMath import Text.XML.Light @@ -172,17 +174,18 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive updateStyleWithLang Nothing arch = return arch updateStyleWithLang (Just lang) arch = do epochtime <- floor `fmap` lift P.getPOSIXTime - return arch{ zEntries = [if eRelativePath e == "styles.xml" - then case parseXMLDoc - (toStringLazy (fromEntry e)) of - Nothing -> e - Just d -> - toEntry "styles.xml" epochtime - ( fromStringLazy - . ppTopElement - . addLang lang $ d ) - else e - | e <- zEntries arch] } + entries <- mapM (\e -> if eRelativePath e == "styles.xml" + then case parseXMLElement + (toTextLazy (fromEntry e)) of + Left msg -> throwError $ + PandocXMLError "styles.xml" msg + Right d -> return $ + toEntry "styles.xml" epochtime + ( fromStringLazy + . ppTopElement + . addLang lang $ d ) + else return e) (zEntries arch) + return arch{ zEntries = entries } addLang :: Lang -> Element -> Element addLang lang = everywhere' (mkT updateLangAttr) diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs index 3ac007f4e..8f60e70d5 100644 --- a/src/Text/Pandoc/Writers/OOXML.hs +++ b/src/Text/Pandoc/Writers/OOXML.hs @@ -35,6 +35,7 @@ import qualified Data.Text as T import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.UTF8 as UTF8 import Text.XML.Light as XML +import Text.Pandoc.XMLParser (parseXMLElement) mknode :: Node t => String -> [(String,String)] -> t -> Element mknode s attrs = @@ -62,10 +63,10 @@ parseXml refArchive distArchive relpath = findEntryByPath relpath distArchive of Nothing -> throwError $ PandocSomeError $ T.pack relpath <> " missing in reference file" - Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of - Nothing -> throwError $ PandocSomeError $ - T.pack relpath <> " corrupt in reference file" - Just d -> return d + Just e -> case parseXMLElement . UTF8.toTextLazy . fromEntry $ e of + Left msg -> + throwError $ PandocXMLError (T.pack relpath) msg + Right d -> return d -- Copied from Util diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs index 8554db622..cd092969b 100644 --- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs +++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs @@ -29,6 +29,7 @@ import Data.Time.Clock (UTCTime) import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime) import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLElement) import Text.Pandoc.Definition import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Class.PandocMonad (PandocMonad) @@ -77,7 +78,8 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer) getPresentationSize refArchive distArchive = do entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus` findEntryByPath "ppt/presentation.xml" distArchive - presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry + presElement <- either (const Nothing) return $ + parseXMLElement $ UTF8.toTextLazy $ fromEntry entry let ns = elemToNameSpaces presElement sldSize <- findChild (elemName ns "p" "sldSz") presElement cxS <- findAttr (QName "cx" Nothing Nothing) sldSize diff --git a/src/Text/Pandoc/XMLParser.hs b/src/Text/Pandoc/XMLParser.hs new file mode 100644 index 000000000..8ad22a66a --- /dev/null +++ b/src/Text/Pandoc/XMLParser.hs @@ -0,0 +1,66 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XMLParser + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Bridge to allow using xml-conduit's parser with xml-light's types. +-} +module Text.Pandoc.XMLParser + ( parseXMLElement + , parseXMLContents + , module Text.XML.Light.Types + ) where + +import qualified Control.Exception as E +import qualified Text.XML as Conduit +import Text.XML.Unresolved (InvalidEventStream(..)) +import qualified Text.XML.Light as Light +import Text.XML.Light.Types +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import qualified Data.Map as M +import Data.Maybe (mapMaybe) + +-- Drop in replacement for parseXMLDoc in xml-light. +parseXMLElement :: TL.Text -> Either T.Text Light.Element +parseXMLElement t = + elementToElement . Conduit.documentRoot <$> + either (Left . T.pack . E.displayException) Right + (Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t) + +parseXMLContents :: TL.Text -> Either T.Text [Light.Content] +parseXMLContents t = + case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of + Left e -> + case E.fromException e of + Just (ContentAfterRoot _) -> + elContent <$> parseXMLElement ("" <> t <> "") + _ -> Left . T.pack . E.displayException $ e + Right x -> Right [Light.Elem . elementToElement . Conduit.documentRoot $ x] + +elementToElement :: Conduit.Element -> Light.Element +elementToElement (Conduit.Element name attribMap nodes) = + Light.Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing + where + attrs = map (\(n,v) -> Light.Attr (nameToQname n) (T.unpack v)) $ + M.toList attribMap + nameToQname (Conduit.Name localName mbns mbpref) = + case mbpref of + Nothing | "xmlns:" `T.isPrefixOf` localName -> + Light.QName (T.unpack $ T.drop 6 localName) (T.unpack <$> mbns) + (Just "xmlns") + _ -> Light.QName (T.unpack localName) (T.unpack <$> mbns) + (T.unpack <$> mbpref) + +nodeToContent :: Conduit.Node -> Maybe Light.Content +nodeToContent (Conduit.NodeElement el) = + Just (Light.Elem (elementToElement el)) +nodeToContent (Conduit.NodeContent t) = + Just (Light.Text (Light.CData Light.CDataText (T.unpack t) Nothing)) +nodeToContent _ = Nothing + diff --git a/test/Tests/Readers/JATS.hs b/test/Tests/Readers/JATS.hs index 525499c86..a9c9a0586 100644 --- a/test/Tests/Readers/JATS.hs +++ b/test/Tests/Readers/JATS.hs @@ -88,6 +88,7 @@ tests = [ testGroup "inline code" "

\n\ \ \n\ \ \n\ + \ \n\ \

" =?> para (math "\\sigma|_{\\{x\\}}") , test jats "math ml only" $ diff --git a/test/command/5321.md b/test/command/5321.md index 081abe2a0..83404632a 100644 --- a/test/command/5321.md +++ b/test/command/5321.md @@ -4,7 +4,7 @@

bar

- + ^D [Para [Image ("fig-1",[],[]) [Str "bar"] ("foo.png","fig:")]] @@ -17,7 +17,7 @@ foo

bar

- + ^D [Para [Image ("fig-1",[],[]) [Str "foo",LineBreak,Str "bar"] ("foo.png","fig:")]] diff --git a/test/docbook-reader.docbook b/test/docbook-reader.docbook index 02568d8de..5717d78d0 100644 --- a/test/docbook-reader.docbook +++ b/test/docbook-reader.docbook @@ -1,6 +1,11 @@ +"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd" +[ + + + +]>
Pandoc Test Suite diff --git a/test/docx/golden/block_quotes.docx b/test/docx/golden/block_quotes.docx index 3e1bf16e7..d3b16d0f2 100644 Binary files a/test/docx/golden/block_quotes.docx and b/test/docx/golden/block_quotes.docx differ diff --git a/test/docx/golden/codeblock.docx b/test/docx/golden/codeblock.docx index 66f055063..6293ef493 100644 Binary files a/test/docx/golden/codeblock.docx and b/test/docx/golden/codeblock.docx differ diff --git a/test/docx/golden/comments.docx b/test/docx/golden/comments.docx index fb3a02a0a..4205a1516 100644 Binary files a/test/docx/golden/comments.docx and b/test/docx/golden/comments.docx differ diff --git a/test/docx/golden/custom_style_no_reference.docx b/test/docx/golden/custom_style_no_reference.docx index bc6c2702a..adb3f23db 100644 Binary files a/test/docx/golden/custom_style_no_reference.docx and b/test/docx/golden/custom_style_no_reference.docx differ diff --git a/test/docx/golden/custom_style_preserve.docx b/test/docx/golden/custom_style_preserve.docx index 8c555a5bd..92c8137fe 100644 Binary files a/test/docx/golden/custom_style_preserve.docx and b/test/docx/golden/custom_style_preserve.docx differ diff --git a/test/docx/golden/custom_style_reference.docx b/test/docx/golden/custom_style_reference.docx index 5f96cc911..f53470617 100644 Binary files a/test/docx/golden/custom_style_reference.docx and b/test/docx/golden/custom_style_reference.docx differ diff --git a/test/docx/golden/definition_list.docx b/test/docx/golden/definition_list.docx index c21b3a5b3..d6af90a72 100644 Binary files a/test/docx/golden/definition_list.docx and b/test/docx/golden/definition_list.docx differ diff --git a/test/docx/golden/document-properties-short-desc.docx b/test/docx/golden/document-properties-short-desc.docx index 92ce144e9..e18dbe853 100644 Binary files a/test/docx/golden/document-properties-short-desc.docx and b/test/docx/golden/document-properties-short-desc.docx differ diff --git a/test/docx/golden/document-properties.docx b/test/docx/golden/document-properties.docx index d21b67309..820299043 100644 Binary files a/test/docx/golden/document-properties.docx and b/test/docx/golden/document-properties.docx differ diff --git a/test/docx/golden/headers.docx b/test/docx/golden/headers.docx index 3558a47bf..ae0f41d12 100644 Binary files a/test/docx/golden/headers.docx and b/test/docx/golden/headers.docx differ diff --git a/test/docx/golden/image.docx b/test/docx/golden/image.docx index 606df92a3..94cd35dfa 100644 Binary files a/test/docx/golden/image.docx and b/test/docx/golden/image.docx differ diff --git a/test/docx/golden/inline_code.docx b/test/docx/golden/inline_code.docx index 759269cac..879f2a25b 100644 Binary files a/test/docx/golden/inline_code.docx and b/test/docx/golden/inline_code.docx differ diff --git a/test/docx/golden/inline_formatting.docx b/test/docx/golden/inline_formatting.docx index c37777080..93f86478f 100644 Binary files a/test/docx/golden/inline_formatting.docx and b/test/docx/golden/inline_formatting.docx differ diff --git a/test/docx/golden/inline_images.docx b/test/docx/golden/inline_images.docx index 9450b1a73..967d297f2 100644 Binary files a/test/docx/golden/inline_images.docx and b/test/docx/golden/inline_images.docx differ diff --git a/test/docx/golden/link_in_notes.docx b/test/docx/golden/link_in_notes.docx index 6f0b830e6..c5614e2fa 100644 Binary files a/test/docx/golden/link_in_notes.docx and b/test/docx/golden/link_in_notes.docx differ diff --git a/test/docx/golden/links.docx b/test/docx/golden/links.docx index e53889cfb..0f39a831f 100644 Binary files a/test/docx/golden/links.docx and b/test/docx/golden/links.docx differ diff --git a/test/docx/golden/lists.docx b/test/docx/golden/lists.docx index 5dbe298b7..07046f223 100644 Binary files a/test/docx/golden/lists.docx and b/test/docx/golden/lists.docx differ diff --git a/test/docx/golden/lists_continuing.docx b/test/docx/golden/lists_continuing.docx index 194181288..3656618e6 100644 Binary files a/test/docx/golden/lists_continuing.docx and b/test/docx/golden/lists_continuing.docx differ diff --git a/test/docx/golden/lists_multiple_initial.docx b/test/docx/golden/lists_multiple_initial.docx index 6e0b634f7..8798253d5 100644 Binary files a/test/docx/golden/lists_multiple_initial.docx and b/test/docx/golden/lists_multiple_initial.docx differ diff --git a/test/docx/golden/lists_restarting.docx b/test/docx/golden/lists_restarting.docx index 477178e77..0a24d1840 100644 Binary files a/test/docx/golden/lists_restarting.docx and b/test/docx/golden/lists_restarting.docx differ diff --git a/test/docx/golden/nested_anchors_in_header.docx b/test/docx/golden/nested_anchors_in_header.docx index 51110356e..52bb7a217 100644 Binary files a/test/docx/golden/nested_anchors_in_header.docx and b/test/docx/golden/nested_anchors_in_header.docx differ diff --git a/test/docx/golden/notes.docx b/test/docx/golden/notes.docx index b6206cdf5..182c06c64 100644 Binary files a/test/docx/golden/notes.docx and b/test/docx/golden/notes.docx differ diff --git a/test/docx/golden/raw-blocks.docx b/test/docx/golden/raw-blocks.docx index 07b576080..7b69a56a3 100644 Binary files a/test/docx/golden/raw-blocks.docx and b/test/docx/golden/raw-blocks.docx differ diff --git a/test/docx/golden/raw-bookmarks.docx b/test/docx/golden/raw-bookmarks.docx index d46095eb7..3d3a35701 100644 Binary files a/test/docx/golden/raw-bookmarks.docx and b/test/docx/golden/raw-bookmarks.docx differ diff --git a/test/docx/golden/table_one_row.docx b/test/docx/golden/table_one_row.docx index 7caba4e93..5ae37b406 100644 Binary files a/test/docx/golden/table_one_row.docx and b/test/docx/golden/table_one_row.docx differ diff --git a/test/docx/golden/table_with_list_cell.docx b/test/docx/golden/table_with_list_cell.docx index 6aaa6da61..c29aa6716 100644 Binary files a/test/docx/golden/table_with_list_cell.docx and b/test/docx/golden/table_with_list_cell.docx differ diff --git a/test/docx/golden/tables.docx b/test/docx/golden/tables.docx index 5746c5ad0..664493246 100644 Binary files a/test/docx/golden/tables.docx and b/test/docx/golden/tables.docx differ diff --git a/test/docx/golden/track_changes_deletion.docx b/test/docx/golden/track_changes_deletion.docx index 5f22dccc6..b6d15340e 100644 Binary files a/test/docx/golden/track_changes_deletion.docx and b/test/docx/golden/track_changes_deletion.docx differ diff --git a/test/docx/golden/track_changes_insertion.docx b/test/docx/golden/track_changes_insertion.docx index ab5c4f56d..f8e1092d2 100644 Binary files a/test/docx/golden/track_changes_insertion.docx and b/test/docx/golden/track_changes_insertion.docx differ diff --git a/test/docx/golden/track_changes_move.docx b/test/docx/golden/track_changes_move.docx index 085f33162..b4cda82f2 100644 Binary files a/test/docx/golden/track_changes_move.docx and b/test/docx/golden/track_changes_move.docx differ diff --git a/test/docx/golden/track_changes_scrubbed_metadata.docx b/test/docx/golden/track_changes_scrubbed_metadata.docx index 1ac86d5c8..ee222efa0 100644 Binary files a/test/docx/golden/track_changes_scrubbed_metadata.docx and b/test/docx/golden/track_changes_scrubbed_metadata.docx differ diff --git a/test/docx/golden/unicode.docx b/test/docx/golden/unicode.docx index c2c443b19..c6f8d9c96 100644 Binary files a/test/docx/golden/unicode.docx and b/test/docx/golden/unicode.docx differ diff --git a/test/docx/golden/verbatim_subsuper.docx b/test/docx/golden/verbatim_subsuper.docx index 5ea18d32e..ea8146690 100644 Binary files a/test/docx/golden/verbatim_subsuper.docx and b/test/docx/golden/verbatim_subsuper.docx differ diff --git a/test/jats-reader.native b/test/jats-reader.native index ab77dd1a0..0715ea8cc 100644 --- a/test/jats-reader.native +++ b/test/jats-reader.native @@ -1,4 +1,4 @@ -Pandoc (Meta {unMeta = fromList [("author",MetaList [MetaInlines [Str "John",Space,Str "MacFarlane"]]),("title",MetaInlines [Str "Pandoc",Space,Str "Test",Space,Str "Suite"])]}) +Pandoc (Meta {unMeta = fromList [("author",MetaList [MetaInlines [Str "John",Space,Str "MacFarlane"],MetaInlines [Str "Anonymous"]]),("title",MetaInlines [Str "Pandoc",Space,Str "Test",Space,Str "Suite"])]}) [Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "set",Space,Str "of",Space,Str "tests",Space,Str "for",Space,Str "pandoc.",Space,Str "Most",Space,Str "of",Space,Str "them",Space,Str "are",Space,Str "adapted",Space,Str "from",Space,Str "John",Space,Str "Gruber's",Space,Str "markdown",Space,Str "test",Space,Str "suite."] ,Header 1 ("headers",[],[]) [Str "Headers"] ,Header 2 ("level-2-with-an-embedded-link",[],[]) [Str "Level",Space,Str "2",Space,Str "with",Space,Str "an",SoftBreak,Link ("",[],[]) [Str "embedded",SoftBreak,Str "link"] ("/url","")] diff --git a/test/jats-reader.xml b/test/jats-reader.xml index f75b3e95a..f33cb9ab3 100644 --- a/test/jats-reader.xml +++ b/test/jats-reader.xml @@ -20,6 +20,7 @@ MacFarlane John + Anonymous diff --git a/test/pptx/code-custom.pptx b/test/pptx/code-custom.pptx index aa9b7692a..58070eb3f 100644 Binary files a/test/pptx/code-custom.pptx and b/test/pptx/code-custom.pptx differ diff --git a/test/pptx/code-custom_templated.pptx b/test/pptx/code-custom_templated.pptx index 9aaef4cb5..db9b7e371 100644 Binary files a/test/pptx/code-custom_templated.pptx and b/test/pptx/code-custom_templated.pptx differ diff --git a/test/pptx/code.pptx b/test/pptx/code.pptx index 1737ec757..c7b1ed7d5 100644 Binary files a/test/pptx/code.pptx and b/test/pptx/code.pptx differ diff --git a/test/pptx/code_templated.pptx b/test/pptx/code_templated.pptx index 87fb560ef..6944d92bf 100644 Binary files a/test/pptx/code_templated.pptx and b/test/pptx/code_templated.pptx differ diff --git a/test/pptx/document-properties-short-desc.pptx b/test/pptx/document-properties-short-desc.pptx index 961c31020..ae0d28429 100644 Binary files a/test/pptx/document-properties-short-desc.pptx and b/test/pptx/document-properties-short-desc.pptx differ diff --git a/test/pptx/document-properties-short-desc_templated.pptx b/test/pptx/document-properties-short-desc_templated.pptx index 894738ef7..37c74c69a 100644 Binary files a/test/pptx/document-properties-short-desc_templated.pptx and b/test/pptx/document-properties-short-desc_templated.pptx differ diff --git a/test/pptx/document-properties.pptx b/test/pptx/document-properties.pptx index 188e8d826..324e443a1 100644 Binary files a/test/pptx/document-properties.pptx and b/test/pptx/document-properties.pptx differ diff --git a/test/pptx/document-properties_templated.pptx b/test/pptx/document-properties_templated.pptx index 253e8c0a7..c81b983e3 100644 Binary files a/test/pptx/document-properties_templated.pptx and b/test/pptx/document-properties_templated.pptx differ diff --git a/test/pptx/endnotes.pptx b/test/pptx/endnotes.pptx index e230420d2..30ce33db6 100644 Binary files a/test/pptx/endnotes.pptx and b/test/pptx/endnotes.pptx differ diff --git a/test/pptx/endnotes_templated.pptx b/test/pptx/endnotes_templated.pptx index 49384fd65..d6c604968 100644 Binary files a/test/pptx/endnotes_templated.pptx and b/test/pptx/endnotes_templated.pptx differ diff --git a/test/pptx/endnotes_toc.pptx b/test/pptx/endnotes_toc.pptx index cdf1be4ad..000e17ecd 100644 Binary files a/test/pptx/endnotes_toc.pptx and b/test/pptx/endnotes_toc.pptx differ diff --git a/test/pptx/endnotes_toc_templated.pptx b/test/pptx/endnotes_toc_templated.pptx index c4fcbad45..fdcd2e29b 100644 Binary files a/test/pptx/endnotes_toc_templated.pptx and b/test/pptx/endnotes_toc_templated.pptx differ diff --git a/test/pptx/images.pptx b/test/pptx/images.pptx index 4a13b5b7f..e73126376 100644 Binary files a/test/pptx/images.pptx and b/test/pptx/images.pptx differ diff --git a/test/pptx/images_templated.pptx b/test/pptx/images_templated.pptx index 7a6e9700e..e3f968e9e 100644 Binary files a/test/pptx/images_templated.pptx and b/test/pptx/images_templated.pptx differ diff --git a/test/pptx/inline_formatting.pptx b/test/pptx/inline_formatting.pptx index 926c8ff3f..eadb9372e 100644 Binary files a/test/pptx/inline_formatting.pptx and b/test/pptx/inline_formatting.pptx differ diff --git a/test/pptx/inline_formatting_templated.pptx b/test/pptx/inline_formatting_templated.pptx index 16f48e182..8ca6bab2b 100644 Binary files a/test/pptx/inline_formatting_templated.pptx and b/test/pptx/inline_formatting_templated.pptx differ diff --git a/test/pptx/lists.pptx b/test/pptx/lists.pptx index f47b17a74..ae188ee68 100644 Binary files a/test/pptx/lists.pptx and b/test/pptx/lists.pptx differ diff --git a/test/pptx/lists_templated.pptx b/test/pptx/lists_templated.pptx index 88109a95e..60301fa50 100644 Binary files a/test/pptx/lists_templated.pptx and b/test/pptx/lists_templated.pptx differ diff --git a/test/pptx/raw_ooxml.pptx b/test/pptx/raw_ooxml.pptx index 84020708f..17124a50d 100644 Binary files a/test/pptx/raw_ooxml.pptx and b/test/pptx/raw_ooxml.pptx differ diff --git a/test/pptx/raw_ooxml_templated.pptx b/test/pptx/raw_ooxml_templated.pptx index a2f77e945..19ae7dd4e 100644 Binary files a/test/pptx/raw_ooxml_templated.pptx and b/test/pptx/raw_ooxml_templated.pptx differ diff --git a/test/pptx/remove_empty_slides.pptx b/test/pptx/remove_empty_slides.pptx index 48bf7bc8a..b650b7585 100644 Binary files a/test/pptx/remove_empty_slides.pptx and b/test/pptx/remove_empty_slides.pptx differ diff --git a/test/pptx/remove_empty_slides_templated.pptx b/test/pptx/remove_empty_slides_templated.pptx index 23b134a5f..0ab029614 100644 Binary files a/test/pptx/remove_empty_slides_templated.pptx and b/test/pptx/remove_empty_slides_templated.pptx differ diff --git a/test/pptx/slide_breaks.pptx b/test/pptx/slide_breaks.pptx index d6eebeffb..2a6e35080 100644 Binary files a/test/pptx/slide_breaks.pptx and b/test/pptx/slide_breaks.pptx differ diff --git a/test/pptx/slide_breaks_slide_level_1.pptx b/test/pptx/slide_breaks_slide_level_1.pptx index a6c76a187..a7bcf6a4b 100644 Binary files a/test/pptx/slide_breaks_slide_level_1.pptx and b/test/pptx/slide_breaks_slide_level_1.pptx differ diff --git a/test/pptx/slide_breaks_slide_level_1_templated.pptx b/test/pptx/slide_breaks_slide_level_1_templated.pptx index 1fbde815b..21b018c25 100644 Binary files a/test/pptx/slide_breaks_slide_level_1_templated.pptx and b/test/pptx/slide_breaks_slide_level_1_templated.pptx differ diff --git a/test/pptx/slide_breaks_templated.pptx b/test/pptx/slide_breaks_templated.pptx index cb3af4aa1..4ec4772a4 100644 Binary files a/test/pptx/slide_breaks_templated.pptx and b/test/pptx/slide_breaks_templated.pptx differ diff --git a/test/pptx/slide_breaks_toc.pptx b/test/pptx/slide_breaks_toc.pptx index dff386885..5983657b6 100644 Binary files a/test/pptx/slide_breaks_toc.pptx and b/test/pptx/slide_breaks_toc.pptx differ diff --git a/test/pptx/slide_breaks_toc_templated.pptx b/test/pptx/slide_breaks_toc_templated.pptx index 43b125f5e..dd54c7082 100644 Binary files a/test/pptx/slide_breaks_toc_templated.pptx and b/test/pptx/slide_breaks_toc_templated.pptx differ diff --git a/test/pptx/speaker_notes.pptx b/test/pptx/speaker_notes.pptx index 3314a1c65..b3e5ed5b9 100644 Binary files a/test/pptx/speaker_notes.pptx and b/test/pptx/speaker_notes.pptx differ diff --git a/test/pptx/speaker_notes_after_metadata.pptx b/test/pptx/speaker_notes_after_metadata.pptx index 27a136838..1078854bb 100644 Binary files a/test/pptx/speaker_notes_after_metadata.pptx and b/test/pptx/speaker_notes_after_metadata.pptx differ diff --git a/test/pptx/speaker_notes_after_metadata_templated.pptx b/test/pptx/speaker_notes_after_metadata_templated.pptx index 7aa3b6a87..5116c6c4e 100644 Binary files a/test/pptx/speaker_notes_after_metadata_templated.pptx and b/test/pptx/speaker_notes_after_metadata_templated.pptx differ diff --git a/test/pptx/speaker_notes_afterheader.pptx b/test/pptx/speaker_notes_afterheader.pptx index d43709ca7..0c8e49bd9 100644 Binary files a/test/pptx/speaker_notes_afterheader.pptx and b/test/pptx/speaker_notes_afterheader.pptx differ diff --git a/test/pptx/speaker_notes_afterheader_templated.pptx b/test/pptx/speaker_notes_afterheader_templated.pptx index 793ea10f6..68695939d 100644 Binary files a/test/pptx/speaker_notes_afterheader_templated.pptx and b/test/pptx/speaker_notes_afterheader_templated.pptx differ diff --git a/test/pptx/speaker_notes_afterseps.pptx b/test/pptx/speaker_notes_afterseps.pptx index 2f4d3b820..7ed9b946d 100644 Binary files a/test/pptx/speaker_notes_afterseps.pptx and b/test/pptx/speaker_notes_afterseps.pptx differ diff --git a/test/pptx/speaker_notes_afterseps_templated.pptx b/test/pptx/speaker_notes_afterseps_templated.pptx index 94a221398..79fc82345 100644 Binary files a/test/pptx/speaker_notes_afterseps_templated.pptx and b/test/pptx/speaker_notes_afterseps_templated.pptx differ diff --git a/test/pptx/speaker_notes_templated.pptx b/test/pptx/speaker_notes_templated.pptx index 22040c88c..9f943c279 100644 Binary files a/test/pptx/speaker_notes_templated.pptx and b/test/pptx/speaker_notes_templated.pptx differ diff --git a/test/pptx/start_numbering_at.pptx b/test/pptx/start_numbering_at.pptx index 18477380b..ac72d8ced 100644 Binary files a/test/pptx/start_numbering_at.pptx and b/test/pptx/start_numbering_at.pptx differ diff --git a/test/pptx/start_numbering_at_templated.pptx b/test/pptx/start_numbering_at_templated.pptx index 4b9d0ba4d..15c7b5469 100644 Binary files a/test/pptx/start_numbering_at_templated.pptx and b/test/pptx/start_numbering_at_templated.pptx differ diff --git a/test/pptx/tables.pptx b/test/pptx/tables.pptx index 1c5b54185..926c5e699 100644 Binary files a/test/pptx/tables.pptx and b/test/pptx/tables.pptx differ diff --git a/test/pptx/tables_templated.pptx b/test/pptx/tables_templated.pptx index 1314f4de4..a37e72d2c 100644 Binary files a/test/pptx/tables_templated.pptx and b/test/pptx/tables_templated.pptx differ diff --git a/test/pptx/two_column.pptx b/test/pptx/two_column.pptx index 9018be36e..7f86533fe 100644 Binary files a/test/pptx/two_column.pptx and b/test/pptx/two_column.pptx differ diff --git a/test/pptx/two_column_templated.pptx b/test/pptx/two_column_templated.pptx index 35e93af67..89e3db0ab 100644 Binary files a/test/pptx/two_column_templated.pptx and b/test/pptx/two_column_templated.pptx differ -- cgit v1.2.3 From 967e7f5fb990b29de48b37be1db40fb149a8cf55 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 14 Feb 2021 22:29:21 -0800 Subject: Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light... ..and add new definitions isomorphic to xml-light's, but with Text instead of String. This allows us to keep most of the code in existing readers that use xml-light, but avoid lots of unnecessary allocation. We also add versions of the functions from xml-light's Text.XML.Light.Output and Text.XML.Light.Proc that operate on our modified XML types, and functions that convert xml-light types to our types (since some of our dependencies, like texmath, use xml-light). Update golden tests for docx and pptx. OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`. Docx: Do a manual traversal to unwrap sdt and smartTag. This is faster, and needed to pass the tests. Benchmarks: A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) C = this commit | Reader | A | B | C | | ------- | ----- | ------ | ----- | | docbook | 18 ms | 12 ms | 10 ms | | opml | 65 ms | 62 ms | 35 ms | | jats | 15 ms | 11 ms | 9 ms | | docx | 72 ms | 69 ms | 44 ms | | odt | 78 ms | 41 ms | 28 ms | | epub | 64 ms | 61 ms | 56 ms | | fb2 | 14 ms | 5 ms | 4 ms | --- .hlint.yaml | 1 + pandoc.cabal | 2 +- src/Text/Pandoc/ImageSize.hs | 9 +- src/Text/Pandoc/Readers/DocBook.hs | 76 +-- src/Text/Pandoc/Readers/Docx/Parse.hs | 163 +++--- src/Text/Pandoc/Readers/Docx/Parse/Styles.hs | 31 +- src/Text/Pandoc/Readers/Docx/Util.hs | 27 +- src/Text/Pandoc/Readers/EPUB.hs | 65 +-- src/Text/Pandoc/Readers/FB2.hs | 93 ++-- src/Text/Pandoc/Readers/JATS.hs | 58 +- src/Text/Pandoc/Readers/OPML.hs | 29 +- src/Text/Pandoc/Readers/Odt.hs | 5 +- src/Text/Pandoc/Readers/Odt/ContentReader.hs | 13 +- src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs | 3 +- src/Text/Pandoc/Readers/Odt/Generic/Utils.hs | 33 +- .../Pandoc/Readers/Odt/Generic/XMLConverter.hs | 23 +- src/Text/Pandoc/Readers/Odt/Namespaces.hs | 11 +- src/Text/Pandoc/Readers/Odt/StyleReader.hs | 23 +- src/Text/Pandoc/Writers/Docx.hs | 263 +++++---- src/Text/Pandoc/Writers/EPUB.hs | 356 ++++++------- src/Text/Pandoc/Writers/FB2.hs | 127 ++--- src/Text/Pandoc/Writers/ODT.hs | 16 +- src/Text/Pandoc/Writers/OOXML.hs | 35 +- src/Text/Pandoc/Writers/Powerpoint/Output.hs | 201 +++---- src/Text/Pandoc/XML/Light.hs | 586 +++++++++++++++++++++ src/Text/Pandoc/XMLParser.hs | 66 --- test/Tests/Writers/OOXML.hs | 3 +- test/docx/golden/block_quotes.docx | Bin 10098 -> 10071 bytes test/docx/golden/codeblock.docx | Bin 9950 -> 9920 bytes test/docx/golden/comments.docx | Bin 10285 -> 10258 bytes test/docx/golden/custom_style_no_reference.docx | Bin 10048 -> 10021 bytes test/docx/golden/custom_style_preserve.docx | Bin 10673 -> 10650 bytes test/docx/golden/custom_style_reference.docx | Bin 12434 -> 12403 bytes test/docx/golden/definition_list.docx | Bin 9947 -> 9920 bytes .../golden/document-properties-short-desc.docx | Bin 9953 -> 9925 bytes test/docx/golden/document-properties.docx | Bin 10429 -> 10404 bytes test/docx/golden/headers.docx | Bin 10086 -> 10059 bytes test/docx/golden/image.docx | Bin 26764 -> 26736 bytes test/docx/golden/inline_code.docx | Bin 9886 -> 9859 bytes test/docx/golden/inline_formatting.docx | Bin 10066 -> 10038 bytes test/docx/golden/inline_images.docx | Bin 26822 -> 26793 bytes test/docx/golden/link_in_notes.docx | Bin 10107 -> 10081 bytes test/docx/golden/links.docx | Bin 10282 -> 10251 bytes test/docx/golden/lists.docx | Bin 10358 -> 10332 bytes test/docx/golden/lists_continuing.docx | Bin 10149 -> 10123 bytes test/docx/golden/lists_multiple_initial.docx | Bin 10238 -> 10210 bytes test/docx/golden/lists_restarting.docx | Bin 10150 -> 10122 bytes test/docx/golden/nested_anchors_in_header.docx | Bin 10245 -> 10216 bytes test/docx/golden/notes.docx | Bin 10052 -> 10028 bytes test/docx/golden/raw-blocks.docx | Bin 9986 -> 9960 bytes test/docx/golden/raw-bookmarks.docx | Bin 10121 -> 10094 bytes test/docx/golden/table_one_row.docx | Bin 9938 -> 9908 bytes test/docx/golden/table_with_list_cell.docx | Bin 10255 -> 10227 bytes test/docx/golden/tables.docx | Bin 10272 -> 10244 bytes test/docx/golden/track_changes_deletion.docx | Bin 9930 -> 9903 bytes test/docx/golden/track_changes_insertion.docx | Bin 9913 -> 9886 bytes test/docx/golden/track_changes_move.docx | Bin 9947 -> 9920 bytes .../golden/track_changes_scrubbed_metadata.docx | Bin 10059 -> 10032 bytes test/docx/golden/unicode.docx | Bin 9871 -> 9845 bytes test/docx/golden/verbatim_subsuper.docx | Bin 9919 -> 9892 bytes test/pptx/code-custom.pptx | Bin 28221 -> 28184 bytes test/pptx/code-custom_templated.pptx | Bin 395516 -> 395477 bytes test/pptx/code.pptx | Bin 28220 -> 28183 bytes test/pptx/code_templated.pptx | Bin 395514 -> 395477 bytes test/pptx/document-properties-short-desc.pptx | Bin 27004 -> 26967 bytes .../document-properties-short-desc_templated.pptx | Bin 394288 -> 394253 bytes test/pptx/document-properties.pptx | Bin 27408 -> 27375 bytes test/pptx/document-properties_templated.pptx | Bin 394691 -> 394656 bytes test/pptx/endnotes.pptx | Bin 26962 -> 26928 bytes test/pptx/endnotes_templated.pptx | Bin 394253 -> 394219 bytes test/pptx/endnotes_toc.pptx | Bin 27789 -> 27747 bytes test/pptx/endnotes_toc_templated.pptx | Bin 395083 -> 395041 bytes test/pptx/images.pptx | Bin 44619 -> 44579 bytes test/pptx/images_templated.pptx | Bin 411909 -> 411870 bytes test/pptx/inline_formatting.pptx | Bin 26148 -> 26121 bytes test/pptx/inline_formatting_templated.pptx | Bin 393438 -> 393412 bytes test/pptx/lists.pptx | Bin 27049 -> 27015 bytes test/pptx/lists_templated.pptx | Bin 394340 -> 394307 bytes test/pptx/raw_ooxml.pptx | Bin 26940 -> 26908 bytes test/pptx/raw_ooxml_templated.pptx | Bin 394231 -> 394198 bytes test/pptx/remove_empty_slides.pptx | Bin 44065 -> 44025 bytes test/pptx/remove_empty_slides_templated.pptx | Bin 411352 -> 411311 bytes test/pptx/slide_breaks.pptx | Bin 28575 -> 28531 bytes test/pptx/slide_breaks_slide_level_1.pptx | Bin 27744 -> 27705 bytes .../pptx/slide_breaks_slide_level_1_templated.pptx | Bin 395038 -> 395000 bytes test/pptx/slide_breaks_templated.pptx | Bin 395868 -> 395825 bytes test/pptx/slide_breaks_toc.pptx | Bin 29532 -> 29481 bytes test/pptx/slide_breaks_toc_templated.pptx | Bin 396826 -> 396776 bytes test/pptx/speaker_notes.pptx | Bin 35436 -> 35360 bytes test/pptx/speaker_notes_after_metadata.pptx | Bin 31675 -> 31636 bytes .../speaker_notes_after_metadata_templated.pptx | Bin 398955 -> 398915 bytes test/pptx/speaker_notes_afterheader.pptx | Bin 30691 -> 30657 bytes test/pptx/speaker_notes_afterheader_templated.pptx | Bin 397979 -> 397943 bytes test/pptx/speaker_notes_afterseps.pptx | Bin 51604 -> 51548 bytes test/pptx/speaker_notes_afterseps_templated.pptx | Bin 418896 -> 418834 bytes test/pptx/speaker_notes_templated.pptx | Bin 402728 -> 402650 bytes test/pptx/start_numbering_at.pptx | Bin 27023 -> 26991 bytes test/pptx/start_numbering_at_templated.pptx | Bin 394314 -> 394283 bytes test/pptx/tables.pptx | Bin 27566 -> 27532 bytes test/pptx/tables_templated.pptx | Bin 394859 -> 394827 bytes test/pptx/two_column.pptx | Bin 26065 -> 26038 bytes test/pptx/two_column_templated.pptx | Bin 393355 -> 393327 bytes 102 files changed, 1388 insertions(+), 930 deletions(-) create mode 100644 src/Text/Pandoc/XML/Light.hs delete mode 100644 src/Text/Pandoc/XMLParser.hs (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/.hlint.yaml b/.hlint.yaml index d5ebffd34..e482b2b37 100644 --- a/.hlint.yaml +++ b/.hlint.yaml @@ -14,6 +14,7 @@ - ignore: {name: "Reduce duplication"} # TODO: could be more fine-grained - ignore: {name: "Use &&&"} - ignore: {name: "Use String"} +- ignore: {name: "Use camelCase"} - ignore: {name: "Use fmap"} # specific for GHC 7.8 compat - ignore: {name: "Use isDigit"} diff --git a/pandoc.cabal b/pandoc.cabal index 3c7063f6c..22aebd55e 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -687,7 +687,7 @@ library Text.Pandoc.Lua.PandocLua, Text.Pandoc.Lua.Util, Text.Pandoc.Lua.Walk, - Text.Pandoc.XMLParser, + Text.Pandoc.XML.Light, Text.Pandoc.CSS, Text.Pandoc.CSV, Text.Pandoc.RoffChar, diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs index e0a1af8e8..bb1aa6351 100644 --- a/src/Text/Pandoc/ImageSize.hs +++ b/src/Text/Pandoc/ImageSize.hs @@ -44,8 +44,7 @@ import Numeric (showFFloat) import Text.Pandoc.Definition import Text.Pandoc.Options import qualified Text.Pandoc.UTF8 as UTF8 -import qualified Text.XML.Light as Xml -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light hiding (Attr) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE @@ -332,12 +331,12 @@ svgSize opts img = do doc <- either (const mzero) return $ parseXMLElement $ TL.fromStrict $ UTF8.toText img let viewboxSize = do - vb <- Xml.findAttrBy (== Xml.QName "viewBox" Nothing Nothing) doc - [_,_,w,h] <- mapM safeRead (T.words (T.pack vb)) + vb <- findAttrBy (== QName "viewBox" Nothing Nothing) doc + [_,_,w,h] <- mapM safeRead (T.words vb) return (w,h) let dpi = fromIntegral $ writerDpi opts let dirToInt dir = do - dim <- Xml.findAttrBy (== Xml.QName dir Nothing Nothing) doc >>= lengthToDim . T.pack + dim <- findAttrBy (== QName dir Nothing Nothing) doc >>= lengthToDim return $ inPixel opts dim w <- dirToInt "width" <|> (fst <$> viewboxSize) h <- dirToInt "height" <|> (snd <$> viewboxSize) diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index ad0108843..e201b54fe 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -31,8 +31,7 @@ import Text.Pandoc.Options import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light {- @@ -578,26 +577,27 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> + convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: Text -> Element -> Bool -named s e = qName (elName e) == T.unpack s +named s e = qName (elName e) == s -- @@ -634,7 +634,7 @@ isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blockTags isBlockElement _ = False -blockTags :: [String] +blockTags :: [Text] blockTags = [ "abstract" , "ackno" @@ -721,7 +721,7 @@ blockTags = , "variablelist" ] ++ admonitionTags -admonitionTags :: [String] +admonitionTags :: [Text] admonitionTags = ["important","caution","note","tip","warning"] -- Trim leading and trailing newline characters @@ -779,10 +779,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "toc" -> skip -- skip TOC, since in pandoc it's autogenerated @@ -837,7 +837,7 @@ parseBlock (Elem e) = "refsect2" -> sect 2 "refsect3" -> sect 3 "refsection" -> gets dbSectionLevel >>= sect . (+1) - l | l `elem` admonitionTags -> parseAdmonition $ T.pack l + l | l `elem` admonitionTags -> parseAdmonition l "area" -> skip "areaset" -> skip "areaspec" -> skip @@ -899,7 +899,7 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then " qn <> "?>" else qn @@ -911,7 +911,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ T.pack $ strContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -965,7 +965,7 @@ parseBlock (Elem e) = w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> (x >= '0' && x <= '9') - || x == '.') (T.pack w) + || x == '.') w if n > 0 then Just n else Nothing let numrows = case bodyrows of [] -> 0 @@ -1048,12 +1048,12 @@ parseMixed container conts = do x <- parseMixed container rs return $ p <> b <> x -parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell] +parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell] parseRow cn = do let isEntry x = named "entry" x || named "td" x || named "th" x mapM (parseEntry cn) . filterChildren isEntry -parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell +parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell parseEntry cn el = do let colDistance sa ea = do let iStrt = elemIndex sa cn @@ -1075,7 +1075,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines getInlines e' = trimInlines . mconcat <$> mapM parseInline (elContent e') -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -1084,9 +1084,9 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> DB m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s +parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref + return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "anchor" -> do @@ -1138,7 +1138,7 @@ parseInline (Elem e) = "userinput" -> codeWithLang "systemitem" -> codeWithLang "varargs" -> return $ code "(...)" - "keycap" -> return (str $ T.pack $ strContent e) + "keycap" -> return (str $ strContent e) "keycombo" -> keycombo <$> mapM parseInline (elContent e) "menuchoice" -> menuchoice <$> @@ -1150,17 +1150,17 @@ parseInline (Elem e) = let title = case attrValue "endterm" e of "" -> maybe "???" xrefTitleByElem (findElementById linkend content) - endterm -> maybe "???" (T.pack . strContent) + endterm -> maybe "???" strContent (findElementById endterm content) return $ link ("#" <> linkend) "" (text title) - "email" -> return $ link ("mailto:" <> T.pack (strContent e)) "" - $ str $ T.pack $ strContent e - "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e + "email" -> return $ link ("mailto:" <> strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e "ulink" -> innerInlines (link (attrValue "url" e) "") "link" -> do ils <- innerInlines id let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, T.words $ attrValue "role" e, []) @@ -1180,7 +1180,7 @@ parseInline (Elem e) = "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then " qn <> "?>" else qn @@ -1193,7 +1193,7 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -1234,10 +1234,10 @@ parseInline (Elem e) = "sect5" -> descendantContent "title" el "cmdsynopsis" -> descendantContent "command" el "funcsynopsis" -> descendantContent "function" el - _ -> T.pack $ qName (elName el) ++ "_title" + _ -> qName (elName el) <> "_title" where xrefLabel = attrValue "xreflabel" el - descendantContent name = maybe "???" (T.pack . strContent) + descendantContent name = maybe "???" strContent . filterElementName (\n -> qName n == name) -- | Extract a math equation from an element @@ -1258,7 +1258,7 @@ equation e constructor = mathMLEquations :: [Text] mathMLEquations = map writeTeX $ rights $ readMath (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") - (readMathML . T.pack . showElement) + (readMathML . showElement) latexEquations :: [Text] latexEquations = readMath (\x -> qName (elName x) == "mathphrase") @@ -1272,8 +1272,8 @@ equation e constructor = -- | Get the actual text stored in a CData block. 'showContent' -- returns the text still surrounded by the [[CDATA]] tags. showVerbatimCData :: Content -> Text -showVerbatimCData (Text (CData _ d _)) = T.pack d -showVerbatimCData c = T.pack $ showContent c +showVerbatimCData (Text (CData _ d _)) = d +showVerbatimCData c = showContent c -- | Set the prefix of a name to 'Nothing' diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 056dab6c2..c76f3c171 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -63,6 +63,7 @@ import Data.Char (chr, ord, readLitChar) import Data.List import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util @@ -72,9 +73,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) -import Text.XML.Light -import qualified Text.XML.Light.Cursor as XMLC -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -128,37 +127,23 @@ mapD f xs = in concatMapM handler xs -unwrap :: NameSpaces -> Content -> [Content] -unwrap ns (Elem element) +unwrapElement :: NameSpaces -> Element -> [Element] +unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap (unwrap ns . Elem) (elChildren sdtContent) + = concatMap (unwrapElement ns) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap (unwrap ns . Elem) (elChildren element) -unwrap _ content = [content] + = concatMap (unwrapElement ns) (elChildren element) + | otherwise + = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] -unwrapChild :: NameSpaces -> Content -> Content -unwrapChild ns (Elem element) = - Elem $ element { elContent = concatMap (unwrap ns) (elContent element) } -unwrapChild _ content = content +unwrapContent :: NameSpaces -> Content -> [Content] +unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element +unwrapContent _ content = [content] -walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor -walkDocument' ns cur = - let modifiedCur = XMLC.modifyContent (unwrapChild ns) cur - in - case XMLC.nextDF modifiedCur of - Just cur' -> walkDocument' ns cur' - Nothing -> XMLC.root modifiedCur - -walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument :: NameSpaces -> Element -> Element walkDocument ns element = - let cur = XMLC.fromContent (Elem element) - cur' = walkDocument' ns cur - in - case XMLC.toTree cur' of - Elem element' -> Just element' - _ -> Nothing - + element{ elContent = concatMap (unwrapContent ns) (elContent element) } newtype Docx = Docx Document deriving Show @@ -361,9 +346,9 @@ getDocumentXmlPath zf = do fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to -- have trouble with. - return $ case fp of + return $ case T.unpack fp of '/' : fp' -> fp' - _ -> fp + fp' -> fp' archiveToDocument :: Archive -> D Document archiveToDocument zf = do @@ -372,7 +357,7 @@ archiveToDocument zf = do docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + let bodyElem' = walkDocument namespaces bodyElem body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -414,8 +399,8 @@ archiveToNotes zf = fn_namespaces = maybe [] elemToNameSpaces fnElem en_namespaces = maybe [] elemToNameSpaces enElem ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= walkDocument ns >>= elemToNotes ns "footnote" - en = enElem >>= walkDocument ns >>= elemToNotes ns "endnote" + fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns + en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in Notes ns fn en @@ -424,7 +409,8 @@ archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= parseXMLFromEntry cmts_namespaces = maybe [] elemToNameSpaces cmtsElem - cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) + cmts = elemToComments cmts_namespaces . walkDocument cmts_namespaces <$> + cmtsElem in case cmts of Just c -> Comments cmts_namespaces c @@ -443,8 +429,8 @@ filePathToRelType path docXmlPath = relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship relElemToRelationship relType element | qName (elName element) == "Relationship" = do - relId <- findAttrText (QName "Id" Nothing Nothing) element - target <- findAttrText (QName "Target" Nothing Nothing) element + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element return $ Relationship relType relId target relElemToRelationship _ _ = Nothing @@ -485,10 +471,10 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do loElemToLevelOverride :: NameSpaces -> Element -> Maybe LevelOverride loElemToLevelOverride ns element | isElem ns "w" "lvlOverride" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element let startOverride = findChildByName ns "w" "startOverride" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger lvl = findChildByName ns "w" "lvl" element >>= levelElemToLevel ns return $ LevelOverride ilvl startOverride lvl @@ -497,9 +483,9 @@ loElemToLevelOverride _ _ = Nothing numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttrTextByName ns "w" "numId" element + numId <- findAttrByName ns "w" "numId" element absNumId <- findChildByName ns "w" "abstractNumId" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let lvlOverrides = mapMaybe (loElemToLevelOverride ns) (findChildrenByName ns "w" "lvlOverride" element) @@ -509,7 +495,7 @@ numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttrTextByName ns "w" "abstractNumId" element + absNumId <- findAttrByName ns "w" "abstractNumId" element let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels @@ -518,14 +504,14 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element fmt <- findChildByName ns "w" "numFmt" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" txt <- findChildByName ns "w" "lvlText" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let start = findChildByName ns "w" "start" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger return (Level ilvl fmt txt start) levelElemToLevel _ _ = Nothing @@ -546,11 +532,11 @@ archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map T.Text Element) +elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element | isElem ns "w" (notetype <> "s") element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" notetype element) in @@ -562,7 +548,7 @@ elemToComments :: NameSpaces -> Element -> M.Map T.Text Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" "comment" element) in @@ -622,12 +608,12 @@ elemToParIndentation ns element | isElem ns "w" "ind" element = stringToInteger , hangingParIndent = findAttrByName ns "w" "hanging" element >>= - stringToInteger} + stringToInteger } elemToParIndentation _ _ = Nothing -testBitMask :: String -> Int -> Bool +testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> (n' .|. n) /= 0 @@ -642,7 +628,7 @@ elemToBodyPart ns element | isElem ns "w" "p" element , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do - expsLst <- eitherToD $ readOMML $ T.pack $ showElement c + expsLst <- eitherToD $ readOMML $ showElement c return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element @@ -666,7 +652,7 @@ elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChildByName ns "w" "tblPr" element >>= findChildByName ns "w" "tblCaption" - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" caption = fromMaybe "" caption' grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g @@ -705,8 +691,8 @@ getTitleAndAlt :: NameSpaces -> Element -> (T.Text, T.Text) getTitleAndAlt ns element = let mbDocPr = findChildByName ns "wp" "inline" element >>= findChildByName ns "wp" "docPr" - title = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "title") - alt = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "descr") + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart @@ -718,7 +704,7 @@ elemToParPart ns element = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrTextByName ns "r" "embed" + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) @@ -728,7 +714,7 @@ elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrTextByName ns "r" "id" + >>= findAttrByName ns "r" "id" in case drawing of -- Todo: check out title and attr for deprecated format. @@ -797,7 +783,7 @@ elemToParPart ns element fldCharState <- gets stateFldCharState case fldCharState of FldCharOpen -> do - info <- eitherToD $ parseFieldInfo $ T.pack $ strContent instrText + info <- eitherToD $ parseFieldInfo $ strContent instrText modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} return NullParPart _ -> return NullParPart @@ -818,48 +804,48 @@ elemToParPart ns element return $ ChangedRuns change runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttrTextByName ns "w" "id" element - , Just bmName <- findAttrTextByName ns "w" "name" element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttrTextByName ns "r" "id" element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> - case findAttrTextByName ns "w" "anchor" element of + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttrTextByName ns "w" "anchor" element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - fmap PlainOMath (eitherToD $ readOMML $ T.pack $ showElement element) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttrTextByName ns "w" "id" element - , Just cmtAuthor <- findAttrTextByName ns "w" "author" element - , cmtDate <- findAttrTextByName ns "w" "date" element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem @@ -878,7 +864,7 @@ elemToExtent drawingElem = where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem - >>= findAttr (QName at Nothing Nothing) >>= safeRead . T.pack + >>= findAttr (QName at Nothing Nothing) >>= safeRead childElemToRun :: NameSpaces -> Element -> D Run @@ -889,7 +875,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrText (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= @@ -902,7 +888,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttrTextByName ns "w" "id" element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -910,7 +896,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttrTextByName ns "w" "id" element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -963,15 +949,15 @@ getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Insertion (ChangeInfo cId cAuthor mcDate) getTrackedChange ns element | isElem ns "w" "del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Deletion (ChangeInfo cId cAuthor mcDate) getTrackedChange _ _ = Nothing @@ -980,7 +966,7 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (fmap ParaStyleId . findAttrTextByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style @@ -1012,7 +998,7 @@ elemToRunStyleD ns element charStyles <- asks envCharStyles let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrTextByName ns "w" "val" >>= + findAttrByName ns "w" "val" >>= flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle @@ -1022,7 +1008,7 @@ elemToRunElem ns element | isElem ns "w" "t" element || isElem ns "w" "delText" element || isElem ns "m" "t" element = do - let str = T.pack $ strContent element + let str = strContent element font <- asks envFont case font of Nothing -> return $ TextRun str @@ -1044,14 +1030,14 @@ getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = - case readLitChar ("\\x" ++ s) of + case readLitChar ("\\x" ++ T.unpack s) of [(char, _)] -> TextRun . maybe "" T.singleton $ getUnicode font char _ -> TextRun "" where getCodepoint = findAttrByName ns "w" "char" element - getFont = textToFont . T.pack =<< findAttrByName ns "w" "font" element - lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + getFont = textToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate t | "F" `T.isPrefixOf` t = "0" <> T.drop 1 t + | otherwise = t getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1061,8 +1047,9 @@ elemToRunElems ns element let qualName = elemName ns "w" let font = do fontElem <- findElement (qualName "rFonts") element - textToFont . T.pack =<< - foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] + textToFont =<< + foldr ((<|>) . (flip findAttr fontElem . qualName)) + Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index edade8654..0d7271d6a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -48,12 +48,13 @@ import Data.Function (on) import Data.String (IsString(..)) import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.Text.Read +import Data.Text (Text) import Data.Maybe import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -109,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , isRTL :: Maybe Bool , isForceCTL :: Maybe Bool , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String + , rUnderline :: Maybe Text , rParentStyle :: Maybe CharStyle } deriving Show @@ -159,7 +160,7 @@ isBasedOnStyle ns element parentStyle , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element @@ -169,7 +170,7 @@ isBasedOnStyle ns element parentStyle | otherwise = False class HasStyleId a => ElemToStyle a where - cStyleType :: Maybe a -> String + cStyleType :: Maybe a -> Text elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a class FromStyleId (StyleId a) => HasStyleId a where @@ -226,8 +227,10 @@ buildBasedOnList ns element rootStyle = stys -> stys ++ concatMap (buildBasedOnList ns element . Just) stys -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +stringToInteger :: Text -> Maybe Integer +stringToInteger s = case Data.Text.Read.decimal s of + Right (x,_) -> Just x + Left _ -> Nothing checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -247,7 +250,7 @@ checkOnOff _ _ _ = Nothing elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle elemToCharStyle ns element parentStyle - = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element) + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) <*> getElementStyleName ns element <*> Just (elemToRunStyle ns element parentStyle) @@ -281,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element | Just styleName <- getElementStyleName ns element - , Just n <- stringToInteger . T.unpack =<< + , Just n <- stringToInteger =<< (T.stripPrefix "heading " . T.toLower $ fromStyleName styleName) , n > 0 = Just (styleName, fromInteger n) @@ -289,8 +292,8 @@ getHeaderLevel _ _ = Nothing getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> - ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val") - <|> findAttrTextByName ns "w" "styleId" el) + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text) getNumInfo ns element = do @@ -298,15 +301,15 @@ getNumInfo ns element = do findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= findChildByName ns "w" "ilvl" >>= - findAttrTextByName ns "w" "val") + findAttrByName ns "w" "val") numId <- numPr >>= findChildByName ns "w" "numId" >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" return (numId, lvl) elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle elemToParStyleData ns element parentStyle - | Just styleId <- findAttrTextByName ns "w" "styleId" element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just styleName <- getElementStyleName ns element = Just $ ParStyle { diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index f9c9a8e26..21df03d9e 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal , @@ -18,51 +19,45 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName - , findAttrText , findAttrByName - , findAttrTextByName ) where import Data.Maybe (mapMaybe) import qualified Data.Text as T -import Text.XML.Light +import Data.Text (Text) +import Text.Pandoc.XML.Light -type NameSpaces = [(String, String)] +type NameSpaces = [(Text, Text)] elemToNameSpaces :: Element -> NameSpaces elemToNameSpaces = mapMaybe attrToNSPair . elAttribs -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) attrToNSPair _ = Nothing -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = let ns' = ns ++ elemToNameSpaces element in qName (elName element) == name && qURI (elName element) == lookup prefix ns' -findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChild (elemName ns' pref name) el -findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChildren (elemName ns' pref name) el -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText x = fmap T.pack . findAttr x - -findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findAttr (elemName ns' pref name) el -findAttrTextByName :: NameSpaces -> String -> String -> Element -> Maybe T.Text -findAttrTextByName a b c = fmap T.pack . findAttrByName a b c diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 369c4f0c9..eb8d2405d 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -23,8 +23,8 @@ import Control.DeepSeq (NFData, deepseq) import Control.Monad (guard, liftM, liftM2, mplus) import Control.Monad.Except (throwError) import qualified Data.ByteString.Lazy as BL (ByteString) -import Data.List (isInfixOf) import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Map as M (Map, elems, fromList, lookup) import Data.Maybe (mapMaybe) import qualified Data.Text.Lazy as TL @@ -40,13 +40,12 @@ import Text.Pandoc.Extensions (Extension (Ext_raw_html), enableExtension) import Text.Pandoc.MIME (MimeType) import Text.Pandoc.Options (ReaderOptions (..)) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI) +import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI, tshow) import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy) import Text.Pandoc.Walk (query, walk) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light -type Items = M.Map String (FilePath, MimeType) +type Items = M.Map Text (FilePath, MimeType) readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc readEPUB opts bytes = case toArchiveOrFail bytes of @@ -126,26 +125,27 @@ imageToPandoc s = B.doc . B.para $ B.image (T.pack s) "" mempty imageMimes :: [MimeType] imageMimes = ["image/gif", "image/jpeg", "image/png"] -type CoverId = String +type CoverId = Text type CoverImage = FilePath -parseManifest :: (PandocMonad m) => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) +parseManifest :: (PandocMonad m) + => Element -> Maybe CoverId -> m (Maybe CoverImage, Items) parseManifest content coverId = do manifest <- findElementE (dfName "manifest") content let items = findChildren (dfName "item") manifest r <- mapM parseItem items let cover = findAttr (emptyName "href") =<< filterChild findCover manifest - return (cover `mplus` coverId, M.fromList r) + return (T.unpack <$> (cover `mplus` coverId), M.fromList r) where - findCover e = maybe False (isInfixOf "cover-image") + findCover e = maybe False (T.isInfixOf "cover-image") (findAttr (emptyName "properties") e) || Just True == liftM2 (==) coverId (findAttr (emptyName "id") e) parseItem e = do uid <- findAttrE (emptyName "id") e href <- findAttrE (emptyName "href") e mime <- findAttrE (emptyName "media-type") e - return (uid, (href, T.pack mime)) + return (uid, (T.unpack href, mime)) parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)] parseSpine is e = do @@ -173,11 +173,11 @@ parseMeta content = do -- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem parseMetaItem :: Element -> Meta -> Meta parseMetaItem e@(stripNamespace . elName -> field) meta = - addMetaField (renameMeta field) (B.str $ T.pack $ strContent e) meta + addMetaField (renameMeta field) (B.str $ strContent e) meta -renameMeta :: String -> T.Text +renameMeta :: Text -> Text renameMeta "creator" = "author" -renameMeta s = T.pack s +renameMeta s = s getManifest :: PandocMonad m => Archive -> m (String, Element) getManifest archive = do @@ -187,7 +187,7 @@ getManifest archive = do ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) as <- fmap (map attrToPair . elAttribs) (findElementE (QName "rootfile" (Just ns) Nothing) docElem) - manifestFile <- mkE "Root not found" (lookup "full-path" as) + manifestFile <- T.unpack <$> mkE "Root not found" (lookup "full-path" as) let rootdir = dropFileName manifestFile --mime <- lookup "media-type" as manifest <- findEntryByPathE manifestFile archive @@ -201,7 +201,8 @@ fixInternalReferences pathToFile = . walk (fixBlockIRs filename) . walk (fixInlineIRs filename) where - (root, T.unpack . escapeURI . T.pack -> filename) = splitFileName pathToFile + (root, T.unpack . escapeURI . T.pack -> filename) = + splitFileName pathToFile fixInlineIRs :: String -> Inline -> Inline fixInlineIRs s (Span as v) = @@ -214,7 +215,7 @@ fixInlineIRs s (Link as is t) = Link (fixAttrs s as) is t fixInlineIRs _ v = v -prependHash :: [T.Text] -> Inline -> Inline +prependHash :: [Text] -> Inline -> Inline prependHash ps l@(Link attr is (url, tit)) | or [s `T.isPrefixOf` url | s <- ps] = Link attr is ("#" <> url, tit) @@ -231,16 +232,17 @@ fixBlockIRs s (CodeBlock as code) = fixBlockIRs _ b = b fixAttrs :: FilePath -> B.Attr -> B.Attr -fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) +fixAttrs s (ident, cs, kvs) = + (addHash s ident, filter (not . T.null) cs, removeEPUBAttrs kvs) -addHash :: String -> T.Text -> T.Text +addHash :: FilePath -> Text -> Text addHash _ "" = "" addHash s ident = T.pack (takeFileName s) <> "#" <> ident -removeEPUBAttrs :: [(T.Text, T.Text)] -> [(T.Text, T.Text)] +removeEPUBAttrs :: [(Text, Text)] -> [(Text, Text)] removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs -isEPUBAttr :: (T.Text, a) -> Bool +isEPUBAttr :: (Text, a) -> Bool isEPUBAttr (k, _) = "epub:" `T.isPrefixOf` k -- Library @@ -257,33 +259,33 @@ uncurry3 f (a, b, c) = f a b c -- Utility -stripNamespace :: QName -> String +stripNamespace :: QName -> Text stripNamespace (QName v _ _) = v -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) attrToNSPair _ = Nothing -attrToPair :: Attr -> (String, String) +attrToPair :: Attr -> (Text, Text) attrToPair (Attr (QName name _ _) val) = (name, val) -defaultNameSpace :: Maybe String +defaultNameSpace :: Maybe Text defaultNameSpace = Just "http://www.idpf.org/2007/opf" -dfName :: String -> QName +dfName :: Text -> QName dfName s = QName s defaultNameSpace Nothing -emptyName :: String -> QName +emptyName :: Text -> QName emptyName s = QName s Nothing Nothing -- Convert Maybe interface to Either -findAttrE :: PandocMonad m => QName -> Element -> m String +findAttrE :: PandocMonad m => QName -> Element -> m Text findAttrE q e = mkE "findAttr" $ findAttr q e findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry findEntryByPathE (normalise . unEscapeString -> path) a = - mkE ("No entry on path: " ++ path) $ findEntryByPath path a + mkE ("No entry on path: " <> T.pack path) $ findEntryByPath path a parseXMLDocE :: PandocMonad m => Entry -> m Element parseXMLDocE entry = @@ -293,7 +295,8 @@ parseXMLDocE entry = fp = T.pack $ eRelativePath entry findElementE :: PandocMonad m => QName -> Element -> m Element -findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x +findElementE e x = + mkE ("Unable to find element: " <> tshow e) $ findElement e x -mkE :: PandocMonad m => String -> Maybe a -> m a -mkE s = maybe (throwError . PandocParseError $ T.pack s) return +mkE :: PandocMonad m => Text -> Maybe a -> m a +mkE s = maybe (throwError . PandocParseError $ s) return diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index b804eab4f..66e390bd7 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -25,7 +25,6 @@ TODO: module Text.Pandoc.Readers.FB2 ( readFB2 ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict -import Data.ByteString.Lazy.Char8 ( pack ) import Data.ByteString.Base64.Lazy import Data.Functor import Data.List (intersperse) @@ -42,8 +41,8 @@ import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light +import qualified Text.Pandoc.UTF8 as UTF8 type FB2 m = StateT FB2State m @@ -85,12 +84,12 @@ removeHash t = case T.uncons t of Just ('#', xs) -> xs _ -> t -convertEntity :: String -> Text -convertEntity e = maybe (T.toUpper $ T.pack e) T.pack $ lookupEntity e +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack $ lookupEntity (T.unpack e) parseInline :: PandocMonad m => Content -> FB2 m Inlines parseInline (Elem e) = - case T.pack $ qName $ elName e of + case qName $ elName e of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -98,12 +97,12 @@ parseInline (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement name pure mempty -parseInline (Text x) = pure $ text $ T.pack $ cdData x +parseInline (Text x) = pure $ text $ cdData x parseInline (CRef r) = pure $ str $ convertEntity r parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks @@ -113,7 +112,7 @@ parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel < parseRootElement :: PandocMonad m => Element -> FB2 m Blocks parseRootElement e = - case T.pack $ qName $ elName e of + case qName $ elName e of "FictionBook" -> do -- Parse notes before parsing the rest of the content. case filterChild isNotesBody e of @@ -146,7 +145,7 @@ parseNote e = Just sectionId -> do content <- mconcat <$> mapM parseSectionChild (dropTitle $ elChildren e) oldNotes <- gets fb2Notes - modify $ \s -> s { fb2Notes = M.insert ("#" <> T.pack sectionId) content oldNotes } + modify $ \s -> s { fb2Notes = M.insert ("#" <> sectionId) content oldNotes } pure () where isTitle x = qName (elName x) == "title" @@ -158,7 +157,7 @@ parseNote e = -- | Parse a child of @\@ element. parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks parseFictionBookChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "stylesheet" -> pure mempty -- stylesheet is ignored "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e) "body" -> if isNotesBody e @@ -170,7 +169,7 @@ parseFictionBookChild e = -- | Parse a child of @\@ element. parseDescriptionChild :: PandocMonad m => Element -> FB2 m () parseDescriptionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title-info" -> mapM_ parseTitleInfoChild (elChildren e) "src-title-info" -> pure () -- ignore "document-info" -> pure () @@ -184,7 +183,7 @@ parseDescriptionChild e = -- | Parse a child of @\@ element. parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks parseBodyChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "image" -> parseImageElement e "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e) "epigraph" -> parseEpigraph e @@ -198,7 +197,10 @@ parseBinaryElement e = (Nothing, _) -> report $ IgnoredElement "binary without id attribute" (Just _, Nothing) -> report $ IgnoredElement "binary without content-type attribute" - (Just filename, contentType) -> insertMedia filename (T.pack <$> contentType) (decodeLenient (pack (strContent e))) + (Just filename, contentType) -> + insertMedia (T.unpack filename) contentType + (decodeLenient + (UTF8.fromTextLazy . TL.fromStrict . strContent $ e)) -- * Type parsers @@ -208,13 +210,13 @@ parseAuthor e = T.unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe Text) parseAuthorChild e = - case T.pack $ qName $ elName e of - "first-name" -> pure $ Just $ T.pack $ strContent e - "middle-name" -> pure $ Just $ T.pack $ strContent e - "last-name" -> pure $ Just $ T.pack $ strContent e - "nickname" -> pure $ Just $ T.pack $ strContent e - "home-page" -> pure $ Just $ T.pack $ strContent e - "email" -> pure $ Just $ T.pack $ strContent e + case qName $ elName e of + "first-name" -> pure $ Just $ strContent e + "middle-name" -> pure $ Just $ strContent e + "last-name" -> pure $ Just $ strContent e + "nickname" -> pure $ Just $ strContent e + "home-page" -> pure $ Just $ strContent e + "email" -> pure $ Just $ strContent e name -> do report $ IgnoredElement $ name <> " in author" pure Nothing @@ -238,13 +240,13 @@ parseTitleContent _ = pure Nothing parseImageElement :: PandocMonad m => Element -> FB2 m Blocks parseImageElement e = case href of - Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash $ T.pack src) title alt + Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt Nothing -> do report $ IgnoredElement " image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - title = maybe "" T.pack $ findAttr (unqual "title") e - imgId = maybe "" T.pack $ findAttr (unqual "id") e + where alt = maybe mempty str $ findAttr (unqual "alt") e + title = fromMaybe "" $ findAttr (unqual "title") e + imgId = fromMaybe "" $ findAttr (unqual "id") e href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e -- | Parse @pType@ @@ -258,7 +260,7 @@ parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e) -- | Parse @citeType@ child parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks parseCiteChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "empty-line" -> pure horizontalRule @@ -273,13 +275,13 @@ parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e) parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks parsePoemChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "epigraph" -> parseEpigraph e "stanza" -> parseStanza e "text-author" -> para <$> parsePType e - "date" -> pure $ para $ text $ T.pack $ strContent e + "date" -> pure $ para $ text $ strContent e name -> report (UnexpectedXmlElement name "poem") $> mempty parseStanza :: PandocMonad m => Element -> FB2 m Blocks @@ -292,7 +294,7 @@ joinLineBlocks [] = [] parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks parseStanzaChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "v" -> lineBlock . (:[]) <$> parsePType e @@ -302,11 +304,11 @@ parseStanzaChild e = parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks parseEpigraph e = divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e) - where divId = maybe "" T.pack $ findAttr (unqual "id") e + where divId = fromMaybe "" $ findAttr (unqual "id") e parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks parseEpigraphChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -320,7 +322,7 @@ parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e) parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks parseAnnotationChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -334,14 +336,14 @@ parseSection :: PandocMonad m => Element -> FB2 m Blocks parseSection e = do n <- gets fb2SectionLevel modify $ \st -> st{ fb2SectionLevel = n + 1 } - let sectionId = maybe "" T.pack $ findAttr (unqual "id") e + let sectionId = fromMaybe "" $ findAttr (unqual "id") e bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e) modify $ \st -> st{ fb2SectionLevel = n } pure bs parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks parseSectionChild e = - case T.pack $ qName $ elName e of + case qName $ elName e of "title" -> parseBodyChild e "epigraph" -> parseEpigraph e "image" -> parseImageElement e @@ -363,16 +365,16 @@ parseStyleType e = mconcat <$> mapM parseInline (elContent e) parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines parseNamedStyle e = do content <- mconcat <$> mapM parseNamedStyleChild (elContent e) - let lang = maybeToList $ ("lang",) . T.pack <$> findAttr (QName "lang" Nothing (Just "xml")) e + let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e case findAttr (unqual "name") e of - Just name -> pure $ spanWith ("", [T.pack name], lang) content + Just name -> pure $ spanWith ("", [name], lang) content Nothing -> do report $ IgnoredElement "link without required name" pure mempty parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines parseNamedStyleChild (Elem e) = - case T.pack $ qName (elName e) of + case qName (elName e) of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -380,7 +382,7 @@ parseNamedStyleChild (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ T.pack $ strContent e + "code" -> pure $ code $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement $ name <> " in style" @@ -392,7 +394,7 @@ parseLinkType :: PandocMonad m => Element -> FB2 m Inlines parseLinkType e = do content <- mconcat <$> mapM parseStyleLinkType (elContent e) notes <- gets fb2Notes - case T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just href -> case findAttr (QName "type" Nothing Nothing) e of Just "note" -> case M.lookup href notes of Nothing -> pure $ link href "" content @@ -419,15 +421,14 @@ parseTable _ = pure mempty -- TODO: tables are not supported yet -- | Parse @title-infoType@ parseTitleInfoChild :: PandocMonad m => Element -> FB2 m () parseTitleInfoChild e = - case T.pack $ qName (elName e) of + case qName (elName e) of "genre" -> pure () "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st}) - "book-title" -> modify (setMeta "title" (text $ T.pack $ strContent e)) + "book-title" -> modify (setMeta "title" (text $ strContent e)) "annotation" -> parseAnnotation e >>= modify . setMeta "abstract" "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ T.splitOn "," - $ T.pack $ strContent e)) - "date" -> modify (setMeta "date" (text $ T.pack $ strContent e)) + "date" -> modify (setMeta "date" (text $ strContent e)) "coverpage" -> parseCoverPage e "lang" -> pure () "src-lang" -> pure () @@ -441,7 +442,7 @@ parseCoverPage e = Just img -> case href of Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src)) Nothing -> pure () - where href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img + where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img Nothing -> pure () -- | Parse @inlineImageType@ element @@ -454,5 +455,5 @@ parseInlineImageElement e = Nothing -> do report $ IgnoredElement "inline image without href" pure mempty - where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e - href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e + where alt = maybe mempty str $ findAttr (unqual "alt") e + href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index dfd343b7a..5353f2001 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -16,7 +16,7 @@ module Text.Pandoc.Readers.JATS ( readJATS ) where import Control.Monad.State.Strict import Control.Monad.Except (throwError) import Text.Pandoc.Error (PandocError(..)) -import Data.Char (isDigit, isSpace, toUpper) +import Data.Char (isDigit, isSpace) import Data.Default import Data.Generics import Data.List (foldl', intersperse) @@ -31,8 +31,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light import qualified Data.Set as S (fromList, member) import Data.Set ((\\)) @@ -67,29 +66,29 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr = fromMaybe "" . maybeAttrValue attr -maybeAttrValue :: String -> Element -> Maybe Text +maybeAttrValue :: Text -> Element -> Maybe Text maybeAttrValue attr elt = - T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt) + lookupAttrBy (\x -> qName x == attr) (elAttribs elt) -- convenience function -named :: String -> Element -> Bool +named :: Text -> Element -> Bool named s e = qName (elName e) == s -- @@ -155,10 +154,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> JATS m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "p" -> parseMixed para (elContent e) @@ -207,7 +206,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ textContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -271,7 +270,7 @@ parseBlock (Elem e) = Just "center" -> AlignCenter _ -> AlignDefault let toWidth c = do - w <- findAttrText (unqual "colwidth") c + w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w if n > 0 then Just n else Nothing let numrows = foldl' max 0 $ map length bodyrows @@ -442,16 +441,10 @@ parseRef e = do Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty -- TODO handle mixed-citation -findAttrText :: QName -> Element -> Maybe Text -findAttrText x = fmap T.pack . findAttr x - textContent :: Element -> Text -textContent = T.pack . strContent - -textContentRecursive :: Element -> Text -textContentRecursive = T.pack . strContentRecursive +textContent = strContent -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -460,9 +453,8 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> JATS m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s -parseInline (CRef ref) = - return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref +parseInline (Text (CData _ s _)) = return $ text s +parseInline (CRef ref) = return . text . convertEntity $ ref parseInline (Elem e) = case qName (elName e) of "italic" -> innerInlines emph @@ -507,9 +499,9 @@ parseInline (Elem e) = else linkWith attr ("#" <> rid) "" ils "ext-link" -> do ils <- innerInlines id - let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e + let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "rid" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, [], []) @@ -529,7 +521,7 @@ parseInline (Elem e) = where innerInlines f = extractSpaces f . mconcat <$> mapM parseInline (elContent e) mathML x = - case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of + case readMathML . showElement $ everywhere (mkT removePrefix) x of Left _ -> mempty Right m -> writeTeX m formula constructor = do @@ -547,4 +539,4 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index bdadc4dd9..184d5a63f 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -13,7 +13,6 @@ Conversion of OPML to 'Pandoc' document. module Text.Pandoc.Readers.OPML ( readOPML ) where import Control.Monad.State.Strict -import Data.Char (toUpper) import Data.Default import Data.Generics import Data.Maybe (fromMaybe) @@ -28,8 +27,7 @@ import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Readers.Markdown (readMarkdown) import Text.Pandoc.Shared (crFilter, blocksToInlines') -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light import Control.Monad.Except (throwError) type OPML m = StateT OPMLState m @@ -69,25 +67,22 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e)) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) - -textContent :: Element -> Text -textContent = T.pack . strContent + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- exceptT :: PandocMonad m => Either PandocError a -> OPML m a -- exceptT = either throwError return @@ -111,11 +106,11 @@ parseBlock :: PandocMonad m => Content -> OPML m Blocks parseBlock (Elem e) = case qName (elName e) of "ownerName" -> mempty <$ modify (\st -> - st{opmlDocAuthors = [text $ textContent e]}) + st{opmlDocAuthors = [text $ strContent e]}) "dateModified" -> mempty <$ modify (\st -> - st{opmlDocDate = text $ textContent e}) + st{opmlDocDate = text $ strContent e}) "title" -> mempty <$ modify (\st -> - st{opmlDocTitle = text $ textContent e}) + st{opmlDocTitle = text $ strContent e}) "outline" -> gets opmlSectionLevel >>= sect . (+1) "?xml" -> return mempty _ -> getBlocks e diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 85308deb1..c274b6fd4 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -14,8 +14,7 @@ Entry point to the odt reader. module Text.Pandoc.Readers.Odt ( readOdt ) where import Codec.Archive.Zip -import qualified Text.XML.Light as XML -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light import qualified Data.ByteString.Lazy as B @@ -91,7 +90,7 @@ archiveToOdt archive = do -- -entryToXmlElem :: Entry -> Either PandocError XML.Element +entryToXmlElem :: Entry -> Either PandocError Element entryToXmlElem entry = case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of Right x -> Right x diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 43c44e7e9..df90880fa 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -29,14 +29,14 @@ import Control.Monad ((<=<)) import qualified Data.ByteString.Lazy as B import Data.Foldable (fold) -import Data.List (find, stripPrefix) +import Data.List (find) import qualified Data.Map as M import qualified Data.Text as T import Data.Maybe import Data.Semigroup (First(..), Option(..)) import Text.TeXMath (readMathML, writeTeX) -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Builder hiding (underline) import Text.Pandoc.MediaBag (MediaBag, insertMedia) @@ -557,7 +557,7 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover >>?% mappend -- extractText :: XML.Content -> Fallible T.Text - extractText (XML.Text cData) = succeedWith (T.pack $ XML.cdData cData) + extractText (XML.Text cData) = succeedWith (XML.cdData cData) extractText _ = failEmpty read_text_seq :: InlineMatcher @@ -777,14 +777,14 @@ read_frame_img = "" -> returnV mempty -< () src' -> do let exts = extensionsFromList [Ext_auto_identifiers] - resource <- lookupResource -< src' + resource <- lookupResource -< T.unpack src' _ <- updateMediaWithResource -< resource w <- findAttrText' NsSVG "width" -< () h <- findAttrText' NsSVG "height" -< () titleNodes <- matchChildContent' [ read_frame_title ] -< () alt <- matchChildContent [] read_plain_text -< () arr (firstMatch . uncurry4 imageWith) -< - (image_attributes w h, T.pack src', inlineListToIdentifier exts (toList titleNodes), alt) + (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) read_frame_title :: InlineMatcher read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) @@ -804,7 +804,8 @@ read_frame_mathml = case fold src of "" -> returnV mempty -< () src' -> do - let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml" + let path = T.unpack $ + fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml" (_, mathml) <- lookupResource -< path case readMathML (UTF8.toText $ B.toStrict mathml) of Left _ -> returnV mempty -< () diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs index 77174c793..78a7fc0b2 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs @@ -14,9 +14,10 @@ typesafe Haskell namespace identifiers and unsafe "real world" namespaces. module Text.Pandoc.Readers.Odt.Generic.Namespaces where import qualified Data.Map as M +import Data.Text (Text) -- -type NameSpaceIRI = String +type NameSpaceIRI = Text -- type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs index 6dc56a0d9..edefe3c70 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs @@ -20,7 +20,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils , reverseComposition , tryToRead , Lookupable(..) -, readLookupables , readLookupable , readPercent , findBy @@ -30,11 +29,11 @@ module Text.Pandoc.Readers.Odt.Generic.Utils import Control.Category (Category, (<<<), (>>>)) import qualified Control.Category as Cat (id) -import Control.Monad (msum) - +import Data.Char (isSpace) import qualified Data.Foldable as F (Foldable, foldr) import Data.Maybe - +import Data.Text (Text) +import qualified Data.Text as T -- | Equivalent to -- > foldr (.) id @@ -76,8 +75,8 @@ swing = flip.(.flip id) -- (nobody wants that) while the latter returns "to much" for simple purposes. -- This function instead applies 'reads' and returns the first match (if any) -- in a 'Maybe'. -tryToRead :: (Read r) => String -> Maybe r -tryToRead = reads >>> listToMaybe >>> fmap fst +tryToRead :: (Read r) => Text -> Maybe r +tryToRead = (reads . T.unpack) >>> listToMaybe >>> fmap fst -- | A version of 'reads' that requires a '%' sign after the number readPercent :: ReadS Int @@ -88,26 +87,12 @@ readPercent s = [ (i,s') | (i , r ) <- reads s -- | Data that can be looked up. -- This is mostly a utility to read data with kind *. class Lookupable a where - lookupTable :: [(String, a)] - --- | The idea is to use this function as if there was a declaration like --- --- > instance (Lookupable a) => (Read a) where --- > readsPrec _ = readLookupables --- . --- But including this code in this form would need UndecideableInstances. --- That is a bad idea. Luckily 'readLookupable' (without the s at the end) --- can be used directly in almost any case. -readLookupables :: (Lookupable a) => String -> [(a,String)] -readLookupables s = [ (a,rest) | (word,rest) <- lex s, - a <- maybeToList (lookup word lookupTable) - ] + lookupTable :: [(Text, a)] -- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer. -readLookupable :: (Lookupable a) => String -> Maybe a -readLookupable s = msum - $ map ((`lookup` lookupTable).fst) - $ lex s +readLookupable :: (Lookupable a) => Text -> Maybe a +readLookupable s = + lookup (T.takeWhile (not . isSpace) $ T.dropWhile isSpace s) lookupTable uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 00c636a0d..0d921e23b 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE LambdaCase #-} @@ -60,11 +61,11 @@ import Control.Arrow import Data.Bool ( bool ) import Data.Either ( rights ) import qualified Data.Map as M -import qualified Data.Text as T +import Data.Text (Text) import Data.Default import Data.Maybe -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Readers.Odt.Arrows.State import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -78,13 +79,13 @@ import Text.Pandoc.Readers.Odt.Generic.Fallible -------------------------------------------------------------------------------- -- -type ElementName = String -type AttributeName = String -type AttributeValue = String -type TextAttributeValue = T.Text +type ElementName = Text +type AttributeName = Text +type AttributeValue = Text +type TextAttributeValue = Text -- -type NameSpacePrefix = String +type NameSpacePrefix = Text -- type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix @@ -461,7 +462,7 @@ lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a) lookupDefaultingAttr nsID attrName = lookupAttrWithDefault nsID attrName def --- | Return value as a (Maybe String) +-- | Return value as a (Maybe Text) findAttr' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe AttributeValue) @@ -477,7 +478,6 @@ findAttrText' nsID attrName = qualifyName nsID attrName &&& getCurrentElement >>% XML.findAttr - >>^ fmap T.pack -- | Return value as string or fail findAttr :: (NameSpaceID nsID) @@ -492,7 +492,6 @@ findAttrText :: (NameSpaceID nsID) -> FallibleXMLConverter nsID extraState x TextAttributeValue findAttrText nsID attrName = findAttr' nsID attrName - >>^ fmap T.pack >>> maybeToChoice -- | Return value as string or return provided default value @@ -511,7 +510,7 @@ findAttrTextWithDefault :: (NameSpaceID nsID) -> XMLConverter nsID extraState x TextAttributeValue findAttrTextWithDefault nsID attrName deflt = findAttr' nsID attrName - >>^ maybe deflt T.pack + >>^ fromMaybe deflt -- | Read and return value or fail readAttr :: (NameSpaceID nsID, Read attrValue) @@ -748,7 +747,7 @@ matchContent lookups fallback -- Internals -------------------------------------------------------------------------------- -stringToBool' :: String -> Maybe Bool +stringToBool' :: Text -> Maybe Bool stringToBool' val | val `elem` trueValues = Just True | val `elem` falseValues = Just False | otherwise = Nothing diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs index 3a24a1162..70741c28d 100644 --- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs +++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Reader.Odt.Namespaces Copyright : Copyright (C) 2015 Martin Linnemann @@ -13,10 +14,10 @@ Namespaces used in odt files. module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..) ) where -import Data.List (isPrefixOf) import qualified Data.Map as M (empty, insert) import Data.Maybe (fromMaybe, listToMaybe) - +import Data.Text (Text) +import qualified Data.Text as T import Text.Pandoc.Readers.Odt.Generic.Namespaces @@ -30,7 +31,7 @@ instance NameSpaceID Namespace where findID :: NameSpaceIRI -> Maybe Namespace -findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri] +findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `T.isPrefixOf` iri] nsIDmap :: NameSpaceIRIs Namespace nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs @@ -54,12 +55,12 @@ data Namespace = -- Open Document core -- Core XML (basically only for the 'id'-attribute) | NsXML -- Fallback - | NsOther String + | NsOther Text deriving ( Eq, Ord, Show ) -- | Not the actual iri's, but large prefixes of them - this way there are -- less versioning problems and the like. -nsIDs :: [(String,Namespace)] +nsIDs :: [(Text, Namespace)] nsIDs = [ ("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ), ("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ), diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 46a777df1..5e10f896c 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -2,6 +2,7 @@ {-# LANGUAGE Arrows #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Odt.StyleReader Copyright : Copyright (C) 2015 Martin Linnemann @@ -46,11 +47,13 @@ import qualified Data.Foldable as F import Data.List (unfoldr) import qualified Data.Map as M import Data.Maybe +import Data.Text (Text) +import qualified Data.Text as T import qualified Data.Set as S -import qualified Text.XML.Light as XML +import qualified Text.Pandoc.XML.Light as XML -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, tshow) import Text.Pandoc.Readers.Odt.Arrows.Utils @@ -90,7 +93,7 @@ instance Default FontPitch where -- -- Thus, we want -type FontFaceName = String +type FontFaceName = Text type FontPitches = M.Map FontFaceName FontPitch @@ -151,7 +154,7 @@ findPitch = ( lookupAttr NsStyle "font-pitch" -- Definitions of main data -------------------------------------------------------------------------------- -type StyleName = String +type StyleName = Text -- | There are two types of styles: named styles with a style family and an -- optional style parent, and default styles for each style family, @@ -355,8 +358,8 @@ getListLevelStyle level ListStyle{..} = -- \^ simpler, but in general less efficient data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType - , listItemPrefix :: Maybe String - , listItemSuffix :: Maybe String + , listItemPrefix :: Maybe Text + , listItemSuffix :: Maybe Text , listItemFormat :: ListItemNumberFormat , listItemStart :: Int } @@ -366,9 +369,9 @@ instance Show ListLevelStyle where show ListLevelStyle{..} = " listItemPrefix) ++ show listItemFormat - ++ maybeToString listItemSuffix + ++ maybeToString (T.unpack <$> listItemSuffix) ++ ">" where maybeToString = fromMaybe "" @@ -471,7 +474,7 @@ readTextProperties = ) where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)] isFontBold = ("normal",False):("bold",True) - :map ((,True).show) ([100,200..900]::[Int]) + :map ((,True) . tshow) ([100,200..900]::[Int]) readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode) readUnderlineMode = readLineMode "text-underline-mode" @@ -481,7 +484,7 @@ readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode) readStrikeThroughMode = readLineMode "text-line-through-mode" "text-line-through-style" -readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode) +readLineMode :: Text -> Text -> StyleReaderSafe _x (Maybe UnderlineMode) readLineMode modeAttr styleAttr = proc x -> do isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x mode <- lookupAttr' NsStyle modeAttr -< x diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index da990e4d3..89c71d773 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -31,6 +31,7 @@ import qualified Data.Map as M import Data.Maybe (fromMaybe, isNothing, mapMaybe, maybeToList) import qualified Data.Set as Set import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Text.Lazy as TL import Data.Time.Clock.POSIX import Data.Digest.Pure.SHA (sha1, showDigest) @@ -57,19 +58,19 @@ import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared import Text.Printf (printf) import Text.TeXMath -import Text.XML.Light as XML -import Text.XML.Light.Cursor as XMLC import Text.Pandoc.Writers.OOXML +import Text.Pandoc.XML.Light as XML +import Data.Generics (mkT, everywhere) data ListMarker = NoMarker | BulletMarker | NumberMarker ListNumberStyle ListNumberDelim Int deriving (Show, Read, Eq, Ord) -listMarkerToId :: ListMarker -> String +listMarkerToId :: ListMarker -> Text listMarkerToId NoMarker = "990" listMarkerToId BulletMarker = "991" -listMarkerToId (NumberMarker sty delim n) = +listMarkerToId (NumberMarker sty delim n) = T.pack $ '9' : '9' : styNum : delimNum : show n where styNum = case sty of DefaultStyle -> '2' @@ -106,8 +107,8 @@ data WriterEnv = WriterEnv{ envTextProperties :: EnvProps , envListLevel :: Int , envListNumId :: Int , envInDel :: Bool - , envChangesAuthor :: T.Text - , envChangesDate :: T.Text + , envChangesAuthor :: Text + , envChangesDate :: Text , envPrintWidth :: Integer } @@ -125,9 +126,9 @@ defaultWriterEnv = WriterEnv{ envTextProperties = mempty data WriterState = WriterState{ stFootnotes :: [Element] - , stComments :: [([(T.Text, T.Text)], [Inline])] - , stSectionIds :: Set.Set T.Text - , stExternalLinks :: M.Map String String + , stComments :: [([(Text, Text)], [Inline])] + , stSectionIds :: Set.Set Text + , stExternalLinks :: M.Map Text Text , stImages :: M.Map FilePath (String, String, Maybe MimeType, B.ByteString) , stLists :: [ListMarker] , stInsId :: Int @@ -164,18 +165,18 @@ defaultWriterState = WriterState{ type WS m = ReaderT WriterEnv (StateT WriterState m) -renumIdMap :: Int -> [Element] -> M.Map String String +renumIdMap :: Int -> [Element] -> M.Map Text Text renumIdMap _ [] = M.empty renumIdMap n (e:es) | Just oldId <- findAttr (QName "Id" Nothing Nothing) e = - M.insert oldId ("rId" ++ show n) (renumIdMap (n+1) es) + M.insert oldId ("rId" <> tshow n) (renumIdMap (n+1) es) | otherwise = renumIdMap n es -replaceAttr :: (QName -> Bool) -> String -> [XML.Attr] -> [XML.Attr] +replaceAttr :: (QName -> Bool) -> Text -> [XML.Attr] -> [XML.Attr] replaceAttr f val = map $ \a -> if f (attrKey a) then XML.Attr (attrKey a) val else a -renumId :: (QName -> Bool) -> M.Map String String -> Element -> Element +renumId :: (QName -> Bool) -> M.Map Text Text -> Element -> Element renumId f renumMap e | Just oldId <- findAttrBy f e , Just newId <- M.lookup oldId renumMap = @@ -184,18 +185,12 @@ renumId f renumMap e e { elAttribs = attrs' } | otherwise = e -renumIds :: (QName -> Bool) -> M.Map String String -> [Element] -> [Element] +renumIds :: (QName -> Bool) -> M.Map Text Text -> [Element] -> [Element] renumIds f renumMap = map (renumId f renumMap) -findAttrTextBy :: (QName -> Bool) -> Element -> Maybe T.Text -findAttrTextBy x = fmap T.pack . findAttrBy x - -lookupAttrTextBy :: (QName -> Bool) -> [XML.Attr] -> Maybe T.Text -lookupAttrTextBy x = fmap T.pack . lookupAttrBy x - -- | Certain characters are invalid in XML even if escaped. -- See #1992 -stripInvalidChars :: T.Text -> T.Text +stripInvalidChars :: Text -> Text stripInvalidChars = T.filter isValidChar -- | See XML reference @@ -234,11 +229,11 @@ writeDocx opts doc = do -- Gets the template size let mbpgsz = mbsectpr >>= filterElementName (wname (=="pgSz")) - let mbAttrSzWidth = mbpgsz >>= lookupAttrTextBy ((=="w") . qName) . elAttribs + let mbAttrSzWidth = mbpgsz >>= lookupAttrBy ((=="w") . qName) . elAttribs let mbpgmar = mbsectpr >>= filterElementName (wname (=="pgMar")) - let mbAttrMarLeft = mbpgmar >>= lookupAttrTextBy ((=="left") . qName) . elAttribs - let mbAttrMarRight = mbpgmar >>= lookupAttrTextBy ((=="right") . qName) . elAttribs + let mbAttrMarLeft = mbpgmar >>= lookupAttrBy ((=="left") . qName) . elAttribs + let mbAttrMarRight = mbpgmar >>= lookupAttrBy ((=="right") . qName) . elAttribs -- Get the available area (converting the size and the margins to int and -- doing the difference @@ -250,24 +245,21 @@ writeDocx opts doc = do -- styles mblang <- toLang $ getLang opts meta + -- TODO FIXME avoid this generic traversal! + -- lang is in w:docDefaults / w:rPr / w:lang let addLang :: Element -> Element - addLang e = case (\l -> XMLC.toTree . go (T.unpack $ renderLang l) $ - XMLC.fromElement e) <$> mblang of - Just (Elem e') -> e' - _ -> e -- return original - where go :: String -> Cursor -> Cursor - go l cursor = case XMLC.findRec (isLangElt . current) cursor of - Nothing -> cursor - Just t -> XMLC.modifyContent (setval l) t - setval :: String -> Content -> Content - setval l (Elem e') = Elem $ e'{ elAttribs = map (setvalattr l) $ - elAttribs e' } - setval _ x = x - setvalattr :: String -> XML.Attr -> XML.Attr - setvalattr l (XML.Attr qn@(QName "val" _ _) _) = XML.Attr qn l - setvalattr _ x = x - isLangElt (Elem e') = qName (elName e') == "lang" - isLangElt _ = False + addLang = case mblang of + Nothing -> id + Just l -> everywhere (mkT (go (renderLang l))) + where + go :: Text -> Element -> Element + go l e' + | qName (elName e') == "lang" + = e'{ elAttribs = map (setvalattr l) $ elAttribs e' } + | otherwise = e' + + setvalattr l (XML.Attr qn@(QName "val" _ _) _) = XML.Attr qn l + setvalattr _ x = x let stylepath = "word/styles.xml" styledoc <- addLang <$> parseXml refArchive distArchive stylepath @@ -337,12 +329,13 @@ writeDocx opts doc = do -- [Content_Types].xml let mkOverrideNode (part', contentType') = mknode "Override" - [("PartName",part'),("ContentType",contentType')] () + [("PartName", T.pack part') + ,("ContentType", contentType')] () let mkImageOverride (_, imgpath, mbMimeType, _) = - mkOverrideNode ("/word/" ++ imgpath, - maybe "application/octet-stream" T.unpack mbMimeType) + mkOverrideNode ("/word/" <> imgpath, + fromMaybe "application/octet-stream" mbMimeType) let mkMediaOverride imgpath = - mkOverrideNode ('/':imgpath, T.unpack $ getMimeTypeDef imgpath) + mkOverrideNode ("/" <> imgpath, getMimeTypeDef imgpath) let overrides = map mkOverrideNode ( [("/word/webSettings.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml") @@ -369,13 +362,14 @@ writeDocx opts doc = do ,("/word/footnotes.xml", "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml") ] ++ - map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + map (\x -> (maybe "" (T.unpack . ("/word/" <>)) (extractTarget x), "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml")) headers ++ - map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, + map (\x -> (maybe "" (T.unpack . ("/word/" <>)) (extractTarget x), "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml")) footers) ++ map mkImageOverride imgs ++ - [ mkMediaOverride (eRelativePath e) | e <- zEntries refArchive - , "word/media/" `isPrefixOf` eRelativePath e ] + [ mkMediaOverride (eRelativePath e) + | e <- zEntries refArchive + , "word/media/" `isPrefixOf` eRelativePath e ] let defaultnodes = [mknode "Default" [("Extension","xml"),("ContentType","application/xml")] (), @@ -421,7 +415,7 @@ writeDocx opts doc = do let renumHeaders = renumIds (\q -> qName q == "Id") idMap headers let renumFooters = renumIds (\q -> qName q == "Id") idMap footers let baserels = baserels' ++ renumHeaders ++ renumFooters - let toImgRel (ident,path,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",ident),("Target",path)] () + let toImgRel (ident,path,_,_) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"),("Id",T.pack ident),("Target",T.pack path)] () let imgrels = map toImgRel imgs let toLinkRel (src,ident) = mknode "Relationship" [("Type","http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"),("Id",ident),("Target",src),("TargetMode","External") ] () let linkrels = map toLinkRel $ M.toList $ stExternalLinks st @@ -489,10 +483,10 @@ writeDocx opts doc = do numbering <- parseXml refArchive distArchive numpath let newNumElts = mkNumbering (stLists st) let pandocAdded e = - case findAttrTextBy ((== "abstractNumId") . qName) e >>= safeRead of + case findAttrBy ((== "abstractNumId") . qName) e >>= safeRead of Just numid -> numid >= (990 :: Int) Nothing -> - case findAttrTextBy ((== "numId") . qName) e >>= safeRead of + case findAttrBy ((== "numId") . qName) e >>= safeRead of Just numid -> numid >= (1000 :: Int) Nothing -> False let oldElts = filter (not . pandocAdded) $ onlyElems (elContent numbering) @@ -514,7 +508,7 @@ writeDocx opts doc = do let extraCoreProps = ["subject","lang","category","description"] let extraCorePropsMap = M.fromList $ zip extraCoreProps ["dc:subject","dc:language","cp:category","dc:description"] - let lookupMetaString' :: T.Text -> Meta -> T.Text + let lookupMetaString' :: Text -> Meta -> Text lookupMetaString' key' meta' = case key' of "description" -> T.intercalate "_x000d_\n" (map stringify $ lookupMetaBlocks "description" meta') @@ -530,21 +524,21 @@ writeDocx opts doc = do : mktnode "dc:creator" [] (T.intercalate "; " (map stringify $ docAuthors meta)) : [ mktnode (M.findWithDefault "" k extraCorePropsMap) [] (lookupMetaString' k meta) | k <- M.keys (unMeta meta), k `elem` extraCoreProps] - ++ mknode "cp:keywords" [] (T.unpack $ T.intercalate ", " keywords) + ++ mknode "cp:keywords" [] (T.intercalate ", " keywords) : (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] x , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x - ]) (formatTime defaultTimeLocale "%FT%XZ" utctime) + ]) (T.pack $ formatTime defaultTimeLocale "%FT%XZ" utctime) let docPropsEntry = toEntry docPropsPath epochtime $ renderXml docProps -- docProps/custom.xml - let customProperties :: [(String, String)] - customProperties = [ (T.unpack k, T.unpack $ lookupMetaString k meta) + let customProperties :: [(Text, Text)] + customProperties = [ (k, lookupMetaString k meta) | k <- M.keys (unMeta meta) , k `notElem` (["title", "author", "keywords"] ++ extraCoreProps)] let mkCustomProp (k, v) pid = mknode "property" [("fmtid","{D5CDD505-2E9C-101B-9397-08002B2CF9AE}") - ,("pid", show pid) + ,("pid", tshow pid) ,("name", k)] $ mknode "vt:lpwstr" [] v let customPropsPath = "docProps/custom.xml" let customProps = mknode "Properties" @@ -594,7 +588,8 @@ writeDocx opts doc = do fontTableEntry <- entryFromArchive refArchive "word/fontTable.xml" webSettingsEntry <- entryFromArchive refArchive "word/webSettings.xml" headerFooterEntries <- mapM (entryFromArchive refArchive . ("word/" ++)) $ - mapMaybe extractTarget (headers ++ footers) + mapMaybe (fmap T.unpack . extractTarget) + (headers ++ footers) let miscRelEntries = [ e | e <- zEntries refArchive , "word/_rels/" `isPrefixOf` eRelativePath e , ".xml.rels" `isSuffixOf` eRelativePath e @@ -620,8 +615,8 @@ newParaPropToOpenXml (fromStyleName -> s) = let styleId = T.filter (not . isSpace) s in mknode "w:style" [ ("w:type", "paragraph") , ("w:customStyle", "1") - , ("w:styleId", T.unpack styleId)] - [ mknode "w:name" [("w:val", T.unpack s)] () + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () , mknode "w:basedOn" [("w:val","BodyText")] () , mknode "w:qFormat" [] () ] @@ -631,8 +626,8 @@ newTextPropToOpenXml (fromStyleName -> s) = let styleId = T.filter (not . isSpace) s in mknode "w:style" [ ("w:type", "character") , ("w:customStyle", "1") - , ("w:styleId", T.unpack styleId)] - [ mknode "w:name" [("w:val", T.unpack s)] () + , ("w:styleId", styleId)] + [ mknode "w:name" [("w:val", s)] () , mknode "w:basedOn" [("w:val","BodyTextChar")] () ] @@ -643,13 +638,14 @@ styleToOpenXml sm style = toStyle toktype | hasStyleName (fromString $ show toktype) (smCharStyle sm) = Nothing | otherwise = Just $ mknode "w:style" [("w:type","character"), - ("w:customStyle","1"),("w:styleId",show toktype)] - [ mknode "w:name" [("w:val",show toktype)] () + ("w:customStyle","1"),("w:styleId", tshow toktype)] + [ mknode "w:name" [("w:val", tshow toktype)] () , mknode "w:basedOn" [("w:val","VerbatimChar")] () , mknode "w:rPr" [] $ - [ mknode "w:color" [("w:val",tokCol toktype)] () + [ mknode "w:color" [("w:val", tokCol toktype)] () | tokCol toktype /= "auto" ] ++ - [ mknode "w:shd" [("w:val","clear"),("w:fill",tokBg toktype)] () + [ mknode "w:shd" [("w:val","clear") + ,("w:fill",tokBg toktype)] () | tokBg toktype /= "auto" ] ++ [ mknode "w:b" [] () | tokFeature tokenBold toktype ] ++ [ mknode "w:i" [] () | tokFeature tokenItalic toktype ] ++ @@ -657,10 +653,10 @@ styleToOpenXml sm style = ] tokStyles = tokenStyles style tokFeature f toktype = maybe False f $ M.lookup toktype tokStyles - tokCol toktype = maybe "auto" (drop 1 . fromColor) + tokCol toktype = maybe "auto" (T.pack . drop 1 . fromColor) $ (tokenColor =<< M.lookup toktype tokStyles) `mplus` defaultColor style - tokBg toktype = maybe "auto" (drop 1 . fromColor) + tokBg toktype = maybe "auto" (T.pack . drop 1 . fromColor) $ (tokenBackground =<< M.lookup toktype tokStyles) `mplus` backgroundColor style parStyle | hasStyleName "Source Code" (smParaStyle sm) = Nothing @@ -673,10 +669,11 @@ styleToOpenXml sm style = , mknode "w:pPr" [] $ mknode "w:wordWrap" [("w:val","off")] () : - maybe [] (\col -> [mknode "w:shd" [("w:val","clear"),("w:fill",drop 1 $ fromColor col)] ()]) (backgroundColor style) + maybe [] (\col -> [mknode "w:shd" [("w:val","clear"),("w:fill", T.pack $ drop 1 $ fromColor col)] ()]) (backgroundColor style) ] -copyChildren :: (PandocMonad m) => Archive -> Archive -> String -> Integer -> [String] -> m Entry +copyChildren :: (PandocMonad m) + => Archive -> Archive -> String -> Integer -> [Text] -> m Entry copyChildren refArchive distArchive path timestamp elNames = do ref <- parseXml refArchive distArchive path dist <- parseXml distArchive distArchive path @@ -685,7 +682,7 @@ copyChildren refArchive distArchive path timestamp elNames = do } where strName QName{qName=name, qPrefix=prefix} - | Just p <- prefix = p++":"++name + | Just p <- prefix = p <> ":" <> name | otherwise = name shouldCopy = (`elem` elNames) . strName cleanElem el@Element{elName=name} = Elem el{elName=name{qURI=Nothing}} @@ -706,35 +703,35 @@ maxListLevel = 8 mkNum :: ListMarker -> Int -> Element mkNum marker numid = - mknode "w:num" [("w:numId",show numid)] + mknode "w:num" [("w:numId",tshow numid)] $ mknode "w:abstractNumId" [("w:val",listMarkerToId marker)] () : case marker of NoMarker -> [] BulletMarker -> [] NumberMarker _ _ start -> - map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",show (lvl :: Int))] - $ mknode "w:startOverride" [("w:val",show start)] ()) + map (\lvl -> mknode "w:lvlOverride" [("w:ilvl",tshow (lvl :: Int))] + $ mknode "w:startOverride" [("w:val",tshow start)] ()) [0..maxListLevel] mkAbstractNum :: ListMarker -> Integer -> Element mkAbstractNum marker nsid = mknode "w:abstractNum" [("w:abstractNumId",listMarkerToId marker)] - $ mknode "w:nsid" [("w:val", printf "%8x" nsid)] () + $ mknode "w:nsid" [("w:val", T.pack $ printf "%8x" nsid)] () : mknode "w:multiLevelType" [("w:val","multilevel")] () : map (mkLvl marker) [0..maxListLevel] mkLvl :: ListMarker -> Int -> Element mkLvl marker lvl = - mknode "w:lvl" [("w:ilvl",show lvl)] $ + mknode "w:lvl" [("w:ilvl",tshow lvl)] $ [ mknode "w:start" [("w:val",start)] () | marker /= NoMarker && marker /= BulletMarker ] ++ [ mknode "w:numFmt" [("w:val",fmt)] () - , mknode "w:lvlText" [("w:val",lvltxt)] () + , mknode "w:lvlText" [("w:val", lvltxt)] () , mknode "w:lvlJc" [("w:val","left")] () , mknode "w:pPr" [] - [ mknode "w:ind" [ ("w:left",show $ lvl * step + step) - , ("w:hanging",show (hang :: Int)) + [ mknode "w:ind" [ ("w:left",tshow $ lvl * step + step) + , ("w:hanging",tshow (hang :: Int)) ] () ] ] @@ -743,8 +740,8 @@ mkLvl marker lvl = NoMarker -> ("bullet"," ","1") BulletMarker -> ("bullet",bulletFor lvl,"1") NumberMarker st de n -> (styleFor st lvl - ,patternFor de ("%" ++ show (lvl + 1)) - ,show n) + ,patternFor de ("%" <> tshow (lvl + 1)) + ,tshow n) step = 720 hang = 480 bulletFor 0 = "\x2022" -- filled circle @@ -767,9 +764,9 @@ mkLvl marker lvl = styleFor DefaultStyle 5 = "lowerRoman" styleFor DefaultStyle x = styleFor DefaultStyle (x `mod` 6) styleFor _ _ = "decimal" - patternFor OneParen s = s ++ ")" - patternFor TwoParens s = "(" ++ s ++ ")" - patternFor _ s = s ++ "." + patternFor OneParen s = s <> ")" + patternFor TwoParens s = "(" <> s <> ")" + patternFor _ s = s <> "." getNumId :: (PandocMonad m) => WS m Int getNumId = (((baseListId - 1) +) . length) `fmap` gets stLists @@ -777,8 +774,8 @@ getNumId = (((baseListId - 1) +) . length) `fmap` gets stLists makeTOC :: (PandocMonad m) => WriterOptions -> WS m [Element] makeTOC opts = do - let depth = "1-"++show (writerTOCDepth opts) - let tocCmd = "TOC \\o \""++depth++"\" \\h \\z \\u" + let depth = "1-" <> tshow (writerTOCDepth opts) + let tocCmd = "TOC \\o \"" <> depth <> "\" \\h \\z \\u" tocTitle <- gets stTocTitle title <- withParaPropM (pStyleM "TOC Heading") (blocksToOpenXML opts [Para tocTitle]) return @@ -831,7 +828,7 @@ writeOpenXML opts (Pandoc meta blocks) = do let toComment (kvs, ils) = do annotation <- inlinesToOpenXML opts ils return $ - mknode "w:comment" [('w':':':T.unpack k,T.unpack v) | (k,v) <- kvs] + mknode "w:comment" [("w:" <> k, v) | (k,v) <- kvs] [ mknode "w:p" [] $ map Elem [ mknode "w:pPr" [] @@ -867,24 +864,24 @@ pStyleM :: (PandocMonad m) => ParaStyleName -> WS m XML.Element pStyleM styleName = do pStyleMap <- gets (smParaStyle . stStyleMaps) let sty' = getStyleIdFromName styleName pStyleMap - return $ mknode "w:pStyle" [("w:val", T.unpack $ fromStyleId sty')] () + return $ mknode "w:pStyle" [("w:val", fromStyleId sty')] () rStyleM :: (PandocMonad m) => CharStyleName -> WS m XML.Element rStyleM styleName = do cStyleMap <- gets (smCharStyle . stStyleMaps) let sty' = getStyleIdFromName styleName cStyleMap - return $ mknode "w:rStyle" [("w:val", T.unpack $ fromStyleId sty')] () + return $ mknode "w:rStyle" [("w:val", fromStyleId sty')] () -getUniqueId :: (PandocMonad m) => WS m String +getUniqueId :: (PandocMonad m) => WS m Text -- the + 20 is to ensure that there are no clashes with the rIds -- already in word/document.xml.rel getUniqueId = do n <- gets stCurId modify $ \st -> st{stCurId = n + 1} - return $ show n + return $ tshow n -- | Key for specifying user-defined docx styles. -dynamicStyleKey :: T.Text +dynamicStyleKey :: Text dynamicStyleKey = "custom-style" -- | Convert a Pandoc block element to OpenXML. @@ -979,7 +976,7 @@ blockToOpenXML' opts (Para lst) blockToOpenXML' opts (LineBlock lns) = blockToOpenXML opts $ linesToPara lns blockToOpenXML' _ b@(RawBlock format str) | format == Format "openxml" = return [ - Text (CData CDataRaw (T.unpack str) Nothing) + Text (CData CDataRaw str Nothing) ] | otherwise = do report $ BlockNotRendered b @@ -1036,7 +1033,7 @@ blockToOpenXML' opts (Table _ blkCapt specs thead tbody tfoot) = do let fullrow = 5000 -- 100% specified in pct let rowwidth = fullrow * sum widths let mkgridcol w = mknode "w:gridCol" - [("w:w", show (floor (textwidth * w) :: Integer))] () + [("w:w", tshow (floor (textwidth * w) :: Integer))] () let hasHeader = not $ all null headers modify $ \s -> s { stInTable = False } -- for compatibility with Word <= 2007, we include a val with a bitmask @@ -1054,16 +1051,16 @@ blockToOpenXML' opts (Table _ blkCapt specs thead tbody tfoot) = do mknode "w:tbl" [] ( mknode "w:tblPr" [] ( mknode "w:tblStyle" [("w:val","Table")] () : - mknode "w:tblW" [("w:type", "pct"), ("w:w", show rowwidth)] () : + mknode "w:tblW" [("w:type", "pct"), ("w:w", tshow rowwidth)] () : mknode "w:tblLook" [("w:firstRow",if hasHeader then "1" else "0") ,("w:lastRow","0") ,("w:firstColumn","0") ,("w:lastColumn","0") ,("w:noHBand","0") ,("w:noVBand","0") - ,("w:val", printf "%04x" tblLookVal) + ,("w:val", T.pack $ printf "%04x" tblLookVal) ] () : - [ mknode "w:tblCaption" [("w:val", T.unpack captionStr)] () + [ mknode "w:tblCaption" [("w:val", captionStr)] () | not (null caption) ] ) : mknode "w:tblGrid" [] (if all (==0) widths @@ -1126,7 +1123,7 @@ listItemToOpenXML opts numid (first:rest) = do modify $ \st -> st{ stInList = oldInList } return $ first'' ++ rest'' -alignmentToString :: Alignment -> [Char] +alignmentToString :: Alignment -> Text alignmentToString alignment = case alignment of AlignLeft -> "left" AlignRight -> "right" @@ -1169,8 +1166,8 @@ getParaProps displayMathPara = do listLevel <- asks envListLevel numid <- asks envListNumId let listPr = [mknode "w:numPr" [] - [ mknode "w:ilvl" [("w:val",show listLevel)] () - , mknode "w:numId" [("w:val",show numid)] () ] | listLevel >= 0 && not displayMathPara] + [ mknode "w:ilvl" [("w:val",tshow listLevel)] () + , mknode "w:numId" [("w:val",tshow numid)] () ] | listLevel >= 0 && not displayMathPara] return $ case listPr ++ squashProps props of [] -> [] ps -> [mknode "w:pPr" [] ps] @@ -1185,7 +1182,7 @@ withParaPropM md p = do d <- md withParaProp d p -formattedString :: PandocMonad m => T.Text -> WS m [Element] +formattedString :: PandocMonad m => Text -> WS m [Element] formattedString str = -- properly handle soft hyphens case splitTextBy (=='\173') str of @@ -1194,7 +1191,7 @@ formattedString str = sh <- formattedRun [mknode "w:softHyphen" [] ()] intercalate sh <$> mapM formattedString' ws -formattedString' :: PandocMonad m => T.Text -> WS m [Element] +formattedString' :: PandocMonad m => Text -> WS m [Element] formattedString' str = do inDel <- asks envInDel formattedRun [ mktnode (if inDel then "w:delText" else "w:t") @@ -1226,7 +1223,7 @@ inlineToOpenXML' opts (Span ("",["csl-right-inline"],[]) ils) = mknode "w:r" [] (mknode "w:t" [("xml:space","preserve")] - ("\t" :: String))] ++) + ("\t" :: Text))] ++) <$> inlinesToOpenXML opts ils inlineToOpenXML' opts (Span ("",["csl-indent"],[]) ils) = inlinesToOpenXML opts ils @@ -1236,17 +1233,17 @@ inlineToOpenXML' _ (Span (ident,["comment-start"],kvs) ils) = do let ident' = fromMaybe ident (lookup "id" kvs) kvs' = filter (("id" /=) . fst) kvs modify $ \st -> st{ stComments = (("id",ident'):kvs', ils) : stComments st } - return [ Elem $ mknode "w:commentRangeStart" [("w:id", T.unpack ident')] () ] + return [ Elem $ mknode "w:commentRangeStart" [("w:id", ident')] () ] inlineToOpenXML' _ (Span (ident,["comment-end"],kvs) _) = -- prefer the "id" in kvs, since that is the one produced by the docx -- reader. let ident' = fromMaybe ident (lookup "id" kvs) in return . map Elem $ - [ mknode "w:commentRangeEnd" [("w:id", T.unpack ident')] () + [ mknode "w:commentRangeEnd" [("w:id", ident')] () , mknode "w:r" [] [ mknode "w:rPr" [] [ mknode "w:rStyle" [("w:val", "CommentReference")] () ] - , mknode "w:commentReference" [("w:id", T.unpack ident')] () ] + , mknode "w:commentReference" [("w:id", ident')] () ] ] inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do stylemod <- case lookup dynamicStyleKey kvs of @@ -1270,8 +1267,8 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do defaultAuthor <- asks envChangesAuthor let author = fromMaybe defaultAuthor (lookup "author" kvs) let mdate = lookup "date" kvs - return $ ("w:author", T.unpack author) : - maybe [] (\date -> [("w:date", T.unpack date)]) mdate + return $ ("w:author", author) : + maybe [] (\date -> [("w:date", date)]) mdate insmod <- if "insertion" `elem` classes then do changeAuthorDate <- getChangeAuthorDate @@ -1281,7 +1278,7 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do x <- f return [Elem $ mknode "w:ins" - (("w:id", show insId) : changeAuthorDate) x] + (("w:id", tshow insId) : changeAuthorDate) x] else return id delmod <- if "deletion" `elem` classes then do @@ -1291,7 +1288,7 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do return $ \f -> local (\env->env{envInDel=True}) $ do x <- f return [Elem $ mknode "w:del" - (("w:id", show delId) : changeAuthorDate) x] + (("w:id", tshow delId) : changeAuthorDate) x] else return id contents <- insmod $ delmod $ dirmod $ stylemod $ pmod $ inlinesToOpenXML opts ils @@ -1322,7 +1319,7 @@ inlineToOpenXML' opts (Strikeout lst) = inlineToOpenXML' _ LineBreak = return [Elem br] inlineToOpenXML' _ il@(RawInline f str) | f == Format "openxml" = return - [Text (CData CDataRaw (T.unpack str) Nothing)] + [Text (CData CDataRaw str Nothing)] | otherwise = do report $ InlineNotRendered il return [] @@ -1335,7 +1332,7 @@ inlineToOpenXML' opts (Math mathType str) = do when (mathType == DisplayMath) setFirstPara res <- (lift . lift) (convertMath writeOMML mathType str) case res of - Right r -> return [Elem r] + Right r -> return [Elem $ fromXLElement r] Left il -> inlineToOpenXML' opts il inlineToOpenXML' opts (Cite _ lst) = inlinesToOpenXML opts lst inlineToOpenXML' opts (Code attrs str) = do @@ -1348,7 +1345,7 @@ inlineToOpenXML' opts (Code attrs str) = do mknode "w:r" [] [ mknode "w:rPr" [] $ maybeToList (lookup toktype tokTypesMap) - , mknode "w:t" [("xml:space","preserve")] (T.unpack tok) ] + , mknode "w:t" [("xml:space","preserve")] tok ] withTextPropM (rStyleM "Verbatim Char") $ if isNothing (writerHighlightStyle opts) then unhighlighted @@ -1365,7 +1362,7 @@ inlineToOpenXML' opts (Note bs) = do let notemarker = mknode "w:r" [] [ mknode "w:rPr" [] footnoteStyle , mknode "w:footnoteRef" [] () ] - let notemarkerXml = RawInline (Format "openxml") $ T.pack $ ppElement notemarker + let notemarkerXml = RawInline (Format "openxml") $ ppElement notemarker let insertNoteRef (Plain ils : xs) = Plain (notemarkerXml : Space : ils) : xs insertNoteRef (Para ils : xs) = Para (notemarkerXml : Space : ils) : xs insertNoteRef xs = Para [notemarkerXml] : xs @@ -1384,17 +1381,17 @@ inlineToOpenXML' opts (Note bs) = do inlineToOpenXML' opts (Link _ txt (T.uncons -> Just ('#', xs),_)) = do contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt return - [ Elem $ mknode "w:hyperlink" [("w:anchor", T.unpack $ toBookmarkName xs)] contents ] + [ Elem $ mknode "w:hyperlink" [("w:anchor", toBookmarkName xs)] contents ] -- external link: inlineToOpenXML' opts (Link _ txt (src,_)) = do contents <- withTextPropM (rStyleM "Hyperlink") $ inlinesToOpenXML opts txt extlinks <- gets stExternalLinks - id' <- case M.lookup (T.unpack src) extlinks of + id' <- case M.lookup src extlinks of Just i -> return i Nothing -> do - i <- ("rId"++) `fmap` getUniqueId + i <- ("rId" <>) <$> getUniqueId modify $ \st -> st{ stExternalLinks = - M.insert (T.unpack src) i extlinks } + M.insert src i extlinks } return i return [ Elem $ mknode "w:hyperlink" [("r:id",id')] contents ] inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do @@ -1414,17 +1411,17 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do ,("noChangeAspect","1")] () nvPicPr = mknode "pic:nvPicPr" [] [ mknode "pic:cNvPr" - [("descr",T.unpack src),("id","0"),("name","Picture")] () + [("descr",src),("id","0"),("name","Picture")] () , cNvPicPr ] blipFill = mknode "pic:blipFill" [] - [ mknode "a:blip" [("r:embed",ident)] () + [ mknode "a:blip" [("r:embed",T.pack ident)] () , mknode "a:stretch" [] $ mknode "a:fillRect" [] () ] xfrm = mknode "a:xfrm" [] [ mknode "a:off" [("x","0"),("y","0")] () - , mknode "a:ext" [("cx",show xemu) - ,("cy",show yemu)] () ] + , mknode "a:ext" [("cx",tshow xemu) + ,("cy",tshow yemu)] () ] prstGeom = mknode "a:prstGeom" [("prst","rect")] $ mknode "a:avLst" [] () ln = mknode "a:ln" [("w","9525")] @@ -1445,12 +1442,12 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do imgElt = mknode "w:r" [] $ mknode "w:drawing" [] $ mknode "wp:inline" [] - [ mknode "wp:extent" [("cx",show xemu),("cy",show yemu)] () + [ mknode "wp:extent" [("cx",tshow xemu),("cy",tshow yemu)] () , mknode "wp:effectExtent" [("b","0"),("l","0"),("r","0"),("t","0")] () , mknode "wp:docPr" - [ ("descr", T.unpack $ stringify alt) - , ("title", T.unpack title) + [ ("descr", stringify alt) + , ("title", title) , ("id","1") , ("name","Picture") ] () @@ -1463,7 +1460,7 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do Just imgData -> return [Elem $ generateImgElt imgData] Nothing -> ( do --try (img, mt) <- P.fetchItem src - ident <- ("rId"++) `fmap` getUniqueId + ident <- ("rId" <>) <$> getUniqueId let imgext = case mt >>= extensionFromMimeType of @@ -1477,10 +1474,10 @@ inlineToOpenXML' opts (Image attr@(imgident, _, _) alt (src, title)) = do Just Svg -> ".svg" Just Emf -> ".emf" Nothing -> "" - imgpath = "media/" <> ident <> T.unpack imgext - mbMimeType = mt <|> getMimeType imgpath + imgpath = "media/" <> ident <> imgext + mbMimeType = mt <|> getMimeType (T.unpack imgpath) - imgData = (ident, imgpath, mbMimeType, img) + imgData = (T.unpack ident, T.unpack imgpath, mbMimeType, img) if T.null imgext then -- without an extension there is no rule for content type @@ -1538,20 +1535,20 @@ withDirection x = do , envTextProperties = EnvProps textStyle textProps' } -wrapBookmark :: (PandocMonad m) => T.Text -> [Content] -> WS m [Content] +wrapBookmark :: (PandocMonad m) => Text -> [Content] -> WS m [Content] wrapBookmark "" contents = return contents wrapBookmark ident contents = do id' <- getUniqueId let bookmarkStart = mknode "w:bookmarkStart" [("w:id", id') - ,("w:name", T.unpack $ toBookmarkName ident)] () + ,("w:name", toBookmarkName ident)] () bookmarkEnd = mknode "w:bookmarkEnd" [("w:id", id')] () return $ Elem bookmarkStart : contents ++ [Elem bookmarkEnd] -- Word imposes a 40 character limit on bookmark names and requires -- that they begin with a letter. So we just use a hash of the -- identifier when otherwise we'd have an illegal bookmark name. -toBookmarkName :: T.Text -> T.Text +toBookmarkName :: Text -> Text toBookmarkName s | Just (c, _) <- T.uncons s , isLetter c diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 171ffe582..3f10cb437 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -24,12 +24,13 @@ import Control.Monad.State.Strict (StateT, evalState, evalStateT, get, gets, lift, modify) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as B8 -import Data.Char (isAlphaNum, isAscii, isDigit, toLower) +import Data.Char (isAlphaNum, isAscii, isDigit) import Data.List (isInfixOf, isPrefixOf) import qualified Data.Map as M import Data.Maybe (fromMaybe, isNothing, mapMaybe, isJust) import qualified Data.Set as Set -import qualified Data.Text as TS +import qualified Data.Text as T +import Data.Text (Text) import qualified Data.Text.Lazy as TL import Network.HTTP (urlEncode) import System.FilePath (takeExtension, takeFileName, makeRelative) @@ -48,16 +49,13 @@ import Text.Pandoc.Options (EPUBVersion (..), HTMLMathMethod (..), ObfuscationMethod (NoObfuscation), WrapOption (..), WriterOptions (..)) import Text.Pandoc.Shared (makeSections, normalizeDate, renderTags', - safeRead, stringify, trim, uniqueIdent, tshow) + stringify, uniqueIdent, tshow) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.UUID (getRandomUUID) import Text.Pandoc.Walk (query, walk, walkM) import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB) import Text.Printf (printf) -import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..), - add_attrs, lookupAttr, node, onlyElems, - ppElement, showElement, strContent, unode, unqual) -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light import Text.Pandoc.XML (escapeStringForXML) import Text.DocTemplates (FromContext(lookupContext), Context(..), ToContext(toVal), Val(..)) @@ -69,7 +67,7 @@ newtype Chapter = Chapter [Block] data EPUBState = EPUBState { stMediaPaths :: [(FilePath, (FilePath, Maybe Entry))] , stMediaNextId :: Int - , stEpubSubdir :: String + , stEpubSubdir :: FilePath } type E m = StateT EPUBState m @@ -78,62 +76,63 @@ data EPUBMetadata = EPUBMetadata{ epubIdentifier :: [Identifier] , epubTitle :: [Title] , epubDate :: [Date] - , epubLanguage :: String + , epubLanguage :: Text , epubCreator :: [Creator] , epubContributor :: [Creator] - , epubSubject :: [String] - , epubDescription :: Maybe String - , epubType :: Maybe String - , epubFormat :: Maybe String - , epubPublisher :: Maybe String - , epubSource :: Maybe String - , epubRelation :: Maybe String - , epubCoverage :: Maybe String - , epubRights :: Maybe String - , epubBelongsToCollection :: Maybe String - , epubGroupPosition :: Maybe String - , epubCoverImage :: Maybe String + , epubSubject :: [Text] + , epubDescription :: Maybe Text + , epubType :: Maybe Text + , epubFormat :: Maybe Text + , epubPublisher :: Maybe Text + , epubSource :: Maybe Text + , epubRelation :: Maybe Text + , epubCoverage :: Maybe Text + , epubRights :: Maybe Text + , epubBelongsToCollection :: Maybe Text + , epubGroupPosition :: Maybe Text + , epubCoverImage :: Maybe FilePath , epubStylesheets :: [FilePath] , epubPageDirection :: Maybe ProgressionDirection - , epubIbooksFields :: [(String, String)] - , epubCalibreFields :: [(String, String)] + , epubIbooksFields :: [(Text, Text)] + , epubCalibreFields :: [(Text, Text)] } deriving Show data Date = Date{ - dateText :: String - , dateEvent :: Maybe String + dateText :: Text + , dateEvent :: Maybe Text } deriving Show data Creator = Creator{ - creatorText :: String - , creatorRole :: Maybe String - , creatorFileAs :: Maybe String + creatorText :: Text + , creatorRole :: Maybe Text + , creatorFileAs :: Maybe Text } deriving Show data Identifier = Identifier{ - identifierText :: String - , identifierScheme :: Maybe String + identifierText :: Text + , identifierScheme :: Maybe Text } deriving Show data Title = Title{ - titleText :: String - , titleFileAs :: Maybe String - , titleType :: Maybe String + titleText :: Text + , titleFileAs :: Maybe Text + , titleType :: Maybe Text } deriving Show data ProgressionDirection = LTR | RTL deriving Show -dcName :: String -> QName +dcName :: Text -> QName dcName n = QName n Nothing (Just "dc") -dcNode :: Node t => String -> t -> Element +dcNode :: Node t => Text -> t -> Element dcNode = node . dcName -opfName :: String -> QName +opfName :: Text -> QName opfName n = QName n Nothing (Just "opf") -toId :: FilePath -> String -toId = map (\x -> if isAlphaNum x || x == '-' || x == '_' +toId :: FilePath -> Text +toId = T.pack . + map (\x -> if isAlphaNum x || x == '-' || x == '_' then x else '_') . takeFileName @@ -141,8 +140,8 @@ removeNote :: Inline -> Inline removeNote (Note _) = Str "" removeNote x = x -toVal' :: String -> Val TS.Text -toVal' = toVal . TS.pack +toVal' :: Text -> Val T.Text +toVal' = toVal mkEntry :: PandocMonad m => FilePath -> B.ByteString -> E m Entry mkEntry path content = do @@ -172,21 +171,21 @@ getEPUBMetadata opts meta = do if null (epubIdentifier m) then do randomId <- getRandomUUID - return $ m{ epubIdentifier = [Identifier (show randomId) Nothing] } + return $ m{ epubIdentifier = [Identifier (tshow randomId) Nothing] } else return m let addLanguage m = - if null (epubLanguage m) + if T.null (epubLanguage m) then case lookupContext "lang" (writerVariables opts) of - Just x -> return m{ epubLanguage = TS.unpack x } + Just x -> return m{ epubLanguage = x } Nothing -> do mLang <- lift $ P.lookupEnv "LANG" let localeLang = case mLang of Just lang -> - TS.map (\c -> if c == '_' then '-' else c) $ - TS.takeWhile (/='.') lang + T.map (\c -> if c == '_' then '-' else c) $ + T.takeWhile (/='.') lang Nothing -> "en-US" - return m{ epubLanguage = TS.unpack localeLang } + return m{ epubLanguage = localeLang } else return m let fixDate m = if null (epubDate m) @@ -201,7 +200,7 @@ getEPUBMetadata opts meta = do then return m else do let authors' = map stringify $ docAuthors meta - let toAuthor name = Creator{ creatorText = TS.unpack name + let toAuthor name = Creator{ creatorText = name , creatorRole = Just "aut" , creatorFileAs = Nothing } return $ m{ epubCreator = map toAuthor authors' ++ epubCreator m } @@ -249,31 +248,31 @@ addMetadataFromXML e@(Element (QName name _ (Just "dc")) attrs _ _) md where getAttr n = lookupAttr (opfName n) attrs addMetadataFromXML e@(Element (QName "meta" _ _) attrs _ _) md = case getAttr "property" of - Just s | "ibooks:" `isPrefixOf` s -> - md{ epubIbooksFields = (drop 7 s, strContent e) : + Just s | "ibooks:" `T.isPrefixOf` s -> + md{ epubIbooksFields = (T.drop 7 s, strContent e) : epubIbooksFields md } _ -> case getAttr "name" of - Just s | "calibre:" `isPrefixOf` s -> + Just s | "calibre:" `T.isPrefixOf` s -> md{ epubCalibreFields = - (drop 8 s, fromMaybe "" $ getAttr "content") : + (T.drop 8 s, fromMaybe "" $ getAttr "content") : epubCalibreFields md } _ -> md where getAttr n = lookupAttr (unqual n) attrs addMetadataFromXML _ md = md -metaValueToString :: MetaValue -> String -metaValueToString (MetaString s) = TS.unpack s -metaValueToString (MetaInlines ils) = TS.unpack $ stringify ils -metaValueToString (MetaBlocks bs) = TS.unpack $ stringify bs +metaValueToString :: MetaValue -> Text +metaValueToString (MetaString s) = s +metaValueToString (MetaInlines ils) = stringify ils +metaValueToString (MetaBlocks bs) = stringify bs metaValueToString (MetaBool True) = "true" metaValueToString (MetaBool False) = "false" metaValueToString _ = "" metaValueToPaths :: MetaValue -> [FilePath] -metaValueToPaths (MetaList xs) = map metaValueToString xs -metaValueToPaths x = [metaValueToString x] +metaValueToPaths (MetaList xs) = map (T.unpack . metaValueToString) xs +metaValueToPaths x = [T.unpack $ metaValueToString x] -getList :: TS.Text -> Meta -> (MetaValue -> a) -> [a] +getList :: T.Text -> Meta -> (MetaValue -> a) -> [a] getList s meta handleMetaValue = case lookupMeta s meta of Just (MetaList xs) -> map handleMetaValue xs @@ -297,7 +296,7 @@ getTitle meta = getList "title" meta handleMetaValue , titleType = metaValueToString <$> M.lookup "type" m } handleMetaValue mv = Title (metaValueToString mv) Nothing Nothing -getCreator :: TS.Text -> Meta -> [Creator] +getCreator :: T.Text -> Meta -> [Creator] getCreator s meta = getList s meta handleMetaValue where handleMetaValue (MetaMap m) = Creator{ creatorText = maybe "" metaValueToString $ M.lookup "text" m @@ -305,7 +304,7 @@ getCreator s meta = getList s meta handleMetaValue , creatorRole = metaValueToString <$> M.lookup "role" m } handleMetaValue mv = Creator (metaValueToString mv) Nothing Nothing -getDate :: TS.Text -> Meta -> [Date] +getDate :: T.Text -> Meta -> [Date] getDate s meta = getList s meta handleMetaValue where handleMetaValue (MetaMap m) = Date{ dateText = fromMaybe "" $ @@ -314,7 +313,7 @@ getDate s meta = getList s meta handleMetaValue handleMetaValue mv = Date { dateText = fromMaybe "" $ normalizeDate' $ metaValueToString mv , dateEvent = Nothing } -simpleList :: TS.Text -> Meta -> [String] +simpleList :: T.Text -> Meta -> [Text] simpleList s meta = case lookupMeta s meta of Just (MetaList xs) -> map metaValueToString xs @@ -339,7 +338,7 @@ metadataFromMeta opts meta = EPUBMetadata{ , epubCoverage = coverage , epubRights = rights , epubBelongsToCollection = belongsToCollection - , epubGroupPosition = groupPosition + , epubGroupPosition = groupPosition , epubCoverImage = coverImage , epubStylesheets = stylesheets , epubPageDirection = pageDirection @@ -363,31 +362,30 @@ metadataFromMeta opts meta = EPUBMetadata{ coverage = metaValueToString <$> lookupMeta "coverage" meta rights = metaValueToString <$> lookupMeta "rights" meta belongsToCollection = metaValueToString <$> lookupMeta "belongs-to-collection" meta - groupPosition = metaValueToString <$> lookupMeta "group-position" meta - coverImage = - (TS.unpack <$> lookupContext "epub-cover-image" - (writerVariables opts)) + groupPosition = metaValueToString <$> lookupMeta "group-position" meta + coverImage = T.unpack <$> + lookupContext "epub-cover-image" (writerVariables opts) `mplus` (metaValueToString <$> lookupMeta "cover-image" meta) mCss = lookupMeta "css" meta <|> lookupMeta "stylesheet" meta stylesheets = maybe [] metaValueToPaths mCss ++ case lookupContext "css" (writerVariables opts) of - Just xs -> map TS.unpack xs + Just xs -> map T.unpack xs Nothing -> case lookupContext "css" (writerVariables opts) of - Just x -> [TS.unpack x] + Just x -> [T.unpack x] Nothing -> [] - pageDirection = case map toLower . metaValueToString <$> + pageDirection = case T.toLower . metaValueToString <$> lookupMeta "page-progression-direction" meta of Just "ltr" -> Just LTR Just "rtl" -> Just RTL _ -> Nothing ibooksFields = case lookupMeta "ibooks" meta of Just (MetaMap mp) - -> M.toList $ M.mapKeys TS.unpack $ M.map metaValueToString mp + -> M.toList $ M.map metaValueToString mp _ -> [] calibreFields = case lookupMeta "calibre" meta of Just (MetaMap mp) - -> M.toList $ M.mapKeys TS.unpack $ M.map metaValueToString mp + -> M.toList $ M.map metaValueToString mp _ -> [] -- | Produce an EPUB2 file from a Pandoc document. @@ -413,9 +411,11 @@ writeEPUB :: PandocMonad m writeEPUB epubVersion opts doc = do let epubSubdir = writerEpubSubdirectory opts -- sanity check on epubSubdir - unless (TS.all (\c -> isAscii c && isAlphaNum c) epubSubdir) $ + unless (T.all (\c -> isAscii c && isAlphaNum c) epubSubdir) $ throwError $ PandocEpubSubdirectoryError epubSubdir - let initState = EPUBState { stMediaPaths = [], stMediaNextId = 0, stEpubSubdir = TS.unpack epubSubdir } + let initState = EPUBState { stMediaPaths = [] + , stMediaNextId = 0 + , stEpubSubdir = T.unpack epubSubdir } evalStateT (pandocToEPUB epubVersion opts doc) initState pandocToEPUB :: PandocMonad m @@ -439,7 +439,7 @@ pandocToEPUB version opts doc = do [] -> case epubTitle metadata of [] -> "UNTITLED" (x:_) -> titleText x - x -> TS.unpack $ stringify x + x -> stringify x -- stylesheet stylesheets <- case epubStylesheets metadata of @@ -461,7 +461,8 @@ pandocToEPUB version opts doc = do (ListVal $ map (\e -> toVal' $ (if useprefix then "../" else "") <> - makeRelative epubSubdir (eRelativePath e)) + T.pack + (makeRelative epubSubdir (eRelativePath e))) stylesheetEntries) mempty @@ -490,18 +491,19 @@ pandocToEPUB version opts doc = do case imageSize opts' (B.toStrict imgContent) of Right sz -> return $ sizeInPixels sz Left err' -> (0, 0) <$ report - (CouldNotDetermineImageSize (TS.pack img) err') + (CouldNotDetermineImageSize (T.pack img) err') cpContent <- lift $ writeHtml opts'{ writerVariables = Context (M.fromList [ ("coverpage", toVal' "true"), ("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle), - ("cover-image", toVal' coverImageName), + escapeStringForXML plainTitle), + ("cover-image", + toVal' $ T.pack coverImageName), ("cover-image-width", toVal' $ - show coverImageWidth), + tshow coverImageWidth), ("cover-image-height", toVal' $ - show coverImageHeight)]) <> + tshow coverImageHeight)]) <> cssvars True <> vars } (Pandoc meta []) coverEntry <- mkEntry "text/cover.xhtml" cpContent @@ -517,7 +519,7 @@ pandocToEPUB version opts doc = do ("titlepage", toVal' "true"), ("body-type", toVal' "frontmatter"), ("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle)]) + escapeStringForXML plainTitle)]) <> cssvars True <> vars } (Pandoc meta []) tpEntry <- mkEntry "text/title_page.xhtml" tpContent @@ -526,7 +528,7 @@ pandocToEPUB version opts doc = do let matchingGlob f = do xs <- lift $ P.glob f when (null xs) $ - report $ CouldNotFetchResource (TS.pack f) "glob did not match any font files" + report $ CouldNotFetchResource (T.pack f) "glob did not match any font files" return xs let mkFontEntry f = mkEntry ("fonts/" ++ takeFileName f) =<< lift (P.readFileLazy f) @@ -573,13 +575,13 @@ pandocToEPUB version opts doc = do let chapters' = secsToChapters secs - let extractLinkURL' :: Int -> Inline -> [(TS.Text, TS.Text)] + let extractLinkURL' :: Int -> Inline -> [(T.Text, T.Text)] extractLinkURL' num (Span (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL' num (Link (ident, _, _) _ _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL' num (Image (ident, _, _) _ _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL' num (RawInline fmt raw) | isHtmlFormat fmt = foldr (\tag -> @@ -587,18 +589,18 @@ pandocToEPUB version opts doc = do TagOpen{} -> case fromAttrib "id" tag of "" -> id - x -> ((x, TS.pack (showChapter num) <> "#" <> x):) + x -> ((x, showChapter num <> "#" <> x):) _ -> id) [] (parseTags raw) extractLinkURL' _ _ = [] - let extractLinkURL :: Int -> Block -> [(TS.Text, TS.Text)] + let extractLinkURL :: Int -> Block -> [(T.Text, T.Text)] extractLinkURL num (Div (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL num (Header _ (ident, _, _) _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL num (Table (ident,_,_) _ _ _ _ _) - | not (TS.null ident) = [(ident, TS.pack (showChapter num) <> "#" <> ident)] + | not (T.null ident) = [(ident, showChapter num <> "#" <> ident)] extractLinkURL num (RawBlock fmt raw) | isHtmlFormat fmt = foldr (\tag -> @@ -606,7 +608,7 @@ pandocToEPUB version opts doc = do TagOpen{} -> case fromAttrib "id" tag of "" -> id - x -> ((x, TS.pack (showChapter num) <> "#" <> x):) + x -> ((x, showChapter num <> "#" <> x):) _ -> id) [] (parseTags raw) extractLinkURL num b = query (extractLinkURL' num) b @@ -617,7 +619,7 @@ pandocToEPUB version opts doc = do let fixInternalReferences :: Inline -> Inline fixInternalReferences (Link attr lab (src, tit)) - | Just ('#', xs) <- TS.uncons src = case lookup xs reftable of + | Just ('#', xs) <- T.uncons src = case lookup xs reftable of Just ys -> Link attr lab (ys, tit) Nothing -> Link attr lab (src, tit) fixInternalReferences x = x @@ -630,7 +632,7 @@ pandocToEPUB version opts doc = do chapters' let chapToEntry num (Chapter bs) = - mkEntry ("text/" ++ showChapter num) =<< + mkEntry ("text/" ++ T.unpack (showChapter num)) =<< writeHtml opts'{ writerVariables = Context (M.fromList [("body-type", toVal' bodyType), @@ -677,12 +679,12 @@ pandocToEPUB version opts doc = do let chapterNode ent = unode "item" ! ([("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), ("media-type", "application/xhtml+xml")] ++ case props ent of [] -> [] - xs -> [("properties", unwords xs)]) + xs -> [("properties", T.unwords xs)]) $ () let chapterRefNode ent = unode "itemref" ! @@ -691,17 +693,17 @@ pandocToEPUB version opts doc = do let pictureNode ent = unode "item" ! [("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), ("media-type", - maybe "application/octet-stream" TS.unpack + fromMaybe "application/octet-stream" $ mediaTypeOf $ eRelativePath ent)] $ () let fontNode ent = unode "item" ! [("id", toId $ makeRelative epubSubdir $ eRelativePath ent), - ("href", makeRelative epubSubdir + ("href", T.pack $ makeRelative epubSubdir $ eRelativePath ent), - ("media-type", maybe "" TS.unpack $ + ("media-type", fromMaybe "" $ getMimeType $ eRelativePath ent)] $ () let tocTitle = maybe plainTitle @@ -710,7 +712,7 @@ pandocToEPUB version opts doc = do (x:_) -> return $ identifierText x -- use first identifier as UUID [] -> throwError $ PandocShouldNeverHappenError "epubIdentifier is null" -- shouldn't happen currentTime <- lift P.getTimestamp - let contentsData = UTF8.fromStringLazy $ ppTopElement $ + let contentsData = UTF8.fromTextLazy $ TL.fromStrict $ ppTopElement $ unode "package" ! ([("version", case version of EPUB2 -> "2.0" @@ -728,7 +730,8 @@ pandocToEPUB version opts doc = do ,("media-type","application/xhtml+xml")] ++ [("properties","nav") | epub3 ]) $ () ] ++ - [ unode "item" ! [("id","stylesheet" ++ show n), ("href",fp) + [ unode "item" ! [("id","stylesheet" <> tshow n) + , ("href", T.pack fp) ,("media-type","text/css")] $ () | (n :: Int, fp) <- zip [1..] (map (makeRelative epubSubdir . eRelativePath) @@ -773,7 +776,7 @@ pandocToEPUB version opts doc = do let tocLevel = writerTOCDepth opts let navPointNode :: PandocMonad m - => (Int -> [Inline] -> TS.Text -> [Element] -> Element) + => (Int -> [Inline] -> T.Text -> [Element] -> Element) -> Block -> StateT Int m [Element] navPointNode formatter (Div (ident,_,_) (Header lvl (_,_,kvs) ils : children)) = @@ -783,7 +786,7 @@ pandocToEPUB version opts doc = do n <- get modify (+1) let num = fromMaybe "" $ lookup "number" kvs - let tit = if writerNumberSections opts && not (TS.null num) + let tit = if writerNumberSections opts && not (T.null num) then Span ("", ["section-header-number"], []) [Str num] : Space : ils else ils @@ -797,21 +800,21 @@ pandocToEPUB version opts doc = do concat <$> mapM (navPointNode formatter) bs navPointNode _ _ = return [] - let navMapFormatter :: Int -> [Inline] -> TS.Text -> [Element] -> Element + let navMapFormatter :: Int -> [Inline] -> T.Text -> [Element] -> Element navMapFormatter n tit src subs = unode "navPoint" ! - [("id", "navPoint-" ++ show n)] $ - [ unode "navLabel" $ unode "text" $ TS.unpack $ stringify tit - , unode "content" ! [("src", "text/" <> TS.unpack src)] $ () + [("id", "navPoint-" <> tshow n)] $ + [ unode "navLabel" $ unode "text" $ stringify tit + , unode "content" ! [("src", "text/" <> src)] $ () ] ++ subs let tpNode = unode "navPoint" ! [("id", "navPoint-0")] $ - [ unode "navLabel" $ unode "text" (TS.unpack $ stringify $ docTitle' meta) + [ unode "navLabel" $ unode "text" (stringify $ docTitle' meta) , unode "content" ! [("src", "text/title_page.xhtml")] $ () ] navMap <- lift $ evalStateT (concat <$> mapM (navPointNode navMapFormatter) secs) 1 - let tocData = UTF8.fromStringLazy $ ppTopElement $ + let tocData = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "ncx" ! [("version","2005-1") ,("xmlns","http://www.daisy.org/z3986/2005/ncx/")] $ [ unode "head" $ @@ -833,11 +836,11 @@ pandocToEPUB version opts doc = do ] tocEntry <- mkEntry "toc.ncx" tocData - let navXhtmlFormatter :: Int -> [Inline] -> TS.Text -> [Element] -> Element + let navXhtmlFormatter :: Int -> [Inline] -> T.Text -> [Element] -> Element navXhtmlFormatter n tit src subs = unode "li" ! - [("id", "toc-li-" ++ show n)] $ + [("id", "toc-li-" <> tshow n)] $ (unode "a" ! - [("href", "text/" <> TS.unpack src)] + [("href", "text/" <> src)] $ titElements) : case subs of [] -> [] @@ -850,7 +853,7 @@ pandocToEPUB version opts doc = do , writerVariables = Context (M.fromList [("pagetitle", toVal $ - escapeStringForXML $ TS.pack plainTitle)]) + escapeStringForXML plainTitle)]) <> writerVariables opts} (Pandoc nullMeta [Plain $ walk clean tit])) of @@ -865,7 +868,7 @@ pandocToEPUB version opts doc = do tocBlocks <- lift $ evalStateT (concat <$> mapM (navPointNode navXhtmlFormatter) secs) 1 let navBlocks = [RawBlock (Format "html") - $ TS.pack $ showElement $ -- prettyprinting introduces bad spaces + $ showElement $ -- prettyprinting introduces bad spaces unode navtag ! ([("epub:type","toc") | epub3] ++ [("id","toc")]) $ [ unode "h1" ! [("id","toc-title")] $ tocTitle @@ -875,21 +878,21 @@ pandocToEPUB version opts doc = do [ unode "a" ! [("href", "text/title_page.xhtml") ,("epub:type", "titlepage")] $ - ("Title Page" :: String) ] : + ("Title Page" :: Text) ] : [ unode "li" [ unode "a" ! [("href", "text/cover.xhtml") ,("epub:type", "cover")] $ - ("Cover" :: String)] | + ("Cover" :: Text)] | isJust (epubCoverImage metadata) ] ++ [ unode "li" [ unode "a" ! [("href", "#toc") ,("epub:type", "toc")] $ - ("Table of Contents" :: String) + ("Table of Contents" :: Text) ] | writerTableOfContents opts ] else [] - let landmarks = [RawBlock (Format "html") $ TS.pack $ ppElement $ + let landmarks = [RawBlock (Format "html") $ ppElement $ unode "nav" ! [("epub:type","landmarks") ,("id","landmarks") ,("hidden","hidden")] $ @@ -910,22 +913,22 @@ pandocToEPUB version opts doc = do UTF8.fromStringLazy "application/epub+zip" -- container.xml - let containerData = UTF8.fromStringLazy $ ppTopElement $ + let containerData = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "container" ! [("version","1.0") ,("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")] $ unode "rootfiles" $ unode "rootfile" ! [("full-path", (if null epubSubdir then "" - else epubSubdir ++ "/") ++ "content.opf") + else T.pack epubSubdir <> "/") <> "content.opf") ,("media-type","application/oebps-package+xml")] $ () containerEntry <- mkEntry "META-INF/container.xml" containerData -- com.apple.ibooks.display-options.xml - let apple = UTF8.fromStringLazy $ ppTopElement $ + let apple = B.fromStrict $ UTF8.fromText $ ppTopElement $ unode "display_options" $ unode "platform" ! [("name","*")] $ - unode "option" ! [("name","specified-fonts")] $ ("true" :: String) + unode "option" ! [("name","specified-fonts")] $ ("true" :: Text) appleEntry <- mkEntry "META-INF/com.apple.ibooks.display-options.xml" apple -- construct archive @@ -947,7 +950,8 @@ metadataElement version md currentTime = ++ publisherNodes ++ sourceNodes ++ relationNodes ++ coverageNodes ++ rightsNodes ++ coverImageNodes ++ modifiedNodes ++ belongsToCollectionNodes - withIds base f = concat . zipWith f (map (\x -> base ++ ('-' : show x)) + withIds base f = concat . zipWith f (map (\x -> base <> + T.cons '-' (tshow x)) ([1..] :: [Int])) identifierNodes = withIds "epub-id" toIdentifierNode $ epubIdentifier md @@ -961,9 +965,9 @@ metadataElement version md currentTime = (x:_) -> [dcNode "date" ! [("id","epub-date")] $ dateText x] ibooksNodes = map ibooksNode (epubIbooksFields md) - ibooksNode (k, v) = unode "meta" ! [("property", "ibooks:" ++ k)] $ v + ibooksNode (k, v) = unode "meta" ! [("property", "ibooks:" <> k)] $ v calibreNodes = map calibreNode (epubCalibreFields md) - calibreNode (k, v) = unode "meta" ! [("name", "calibre:" ++ k), + calibreNode (k, v) = unode "meta" ! [("name", "calibre:" <> k), ("content", v)] $ () languageNodes = [dcTag "language" $ epubLanguage md] creatorNodes = withIds "epub-creator" (toCreatorNode "creator") $ @@ -989,12 +993,12 @@ metadataElement version md currentTime = maybe [] (\belongsToCollection -> (unode "meta" ! [("property", "belongs-to-collection"), ("id", "epub-id-1")] $ belongsToCollection ) : - [unode "meta" ! [("refines", "#epub-id-1"), ("property", "collection-type")] $ ("series" :: String) ]) + [unode "meta" ! [("refines", "#epub-id-1"), ("property", "collection-type")] $ ("series" :: Text) ]) (epubBelongsToCollection md)++ maybe [] (\groupPosition -> [unode "meta" ! [("refines", "#epub-id-1"), ("property", "group-position")] $ groupPosition ]) (epubGroupPosition md) - dcTag n s = unode ("dc:" ++ n) s + dcTag n s = unode ("dc:" <> n) s dcTag' n s = [dcTag n s] toIdentifierNode id' (Identifier txt scheme) | version == EPUB2 = [dcNode "identifier" ! @@ -1002,7 +1006,7 @@ metadataElement version md currentTime = txt] | otherwise = (dcNode "identifier" ! [("id",id')] $ txt) : maybe [] ((\x -> [unode "meta" ! - [ ("refines",'#':id') + [ ("refines","#" <> id') , ("property","identifier-type") , ("scheme","onix:codelist5") ] @@ -1018,10 +1022,10 @@ metadataElement version md currentTime = (creatorRole creator >>= toRelator)) $ creatorText creator] | otherwise = [dcNode s ! [("id",id')] $ creatorText creator] ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","file-as")] $ x]) + [("refines","#" <> id'),("property","file-as")] $ x]) (creatorFileAs creator) ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","role"), + [("refines","#" <> id'),("property","role"), ("scheme","marc:relators")] $ x]) (creatorRole creator >>= toRelator) toTitleNode id' title @@ -1033,16 +1037,16 @@ metadataElement version md currentTime = | otherwise = [dcNode "title" ! [("id",id')] $ titleText title] ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","file-as")] $ x]) + [("refines","#" <> id'),("property","file-as")] $ x]) (titleFileAs title) ++ maybe [] (\x -> [unode "meta" ! - [("refines",'#':id'),("property","title-type")] $ x]) + [("refines","#" <> id'),("property","title-type")] $ x]) (titleType title) toDateNode id' date = [dcNode "date" ! (("id",id') : maybe [] (\x -> [("opf:event",x)]) (dateEvent date)) $ dateText date] - schemeToOnix :: String -> String + schemeToOnix :: Text -> Text schemeToOnix "ISBN-10" = "02" schemeToOnix "GTIN-13" = "03" schemeToOnix "UPC" = "04" @@ -1060,48 +1064,48 @@ metadataElement version md currentTime = schemeToOnix "OLCC" = "28" schemeToOnix _ = "01" -showDateTimeISO8601 :: UTCTime -> String -showDateTimeISO8601 = formatTime defaultTimeLocale "%FT%TZ" +showDateTimeISO8601 :: UTCTime -> Text +showDateTimeISO8601 = T.pack . formatTime defaultTimeLocale "%FT%TZ" transformTag :: PandocMonad m - => Tag TS.Text - -> E m (Tag TS.Text) + => Tag T.Text + -> E m (Tag T.Text) transformTag tag@(TagOpen name attr) | name `elem` ["video", "source", "img", "audio"] && isNothing (lookup "data-external" attr) = do let src = fromAttrib "src" tag let poster = fromAttrib "poster" tag - newsrc <- modifyMediaRef $ TS.unpack src - newposter <- modifyMediaRef $ TS.unpack poster + newsrc <- modifyMediaRef $ T.unpack src + newposter <- modifyMediaRef $ T.unpack poster let attr' = filter (\(x,_) -> x /= "src" && x /= "poster") attr ++ - [("src", "../" <> newsrc) | not (TS.null newsrc)] ++ - [("poster", "../" <> newposter) | not (TS.null newposter)] + [("src", "../" <> newsrc) | not (T.null newsrc)] ++ + [("poster", "../" <> newposter) | not (T.null newposter)] return $ TagOpen name attr' transformTag tag = return tag modifyMediaRef :: PandocMonad m => FilePath - -> E m TS.Text + -> E m T.Text modifyMediaRef "" = return "" modifyMediaRef oldsrc = do media <- gets stMediaPaths case lookup oldsrc media of - Just (n,_) -> return $ TS.pack n + Just (n,_) -> return $ T.pack n Nothing -> catchError - (do (img, mbMime) <- P.fetchItem $ TS.pack oldsrc - let ext = maybe (takeExtension (takeWhile (/='?') oldsrc)) TS.unpack + (do (img, mbMime) <- P.fetchItem $ T.pack oldsrc + let ext = maybe (takeExtension (takeWhile (/='?') oldsrc)) T.unpack (("." <>) <$> (mbMime >>= extensionFromMimeType)) newName <- getMediaNextNewName ext let newPath = "media/" ++ newName entry <- mkEntry newPath (B.fromChunks . (:[]) $ img) modify $ \st -> st{ stMediaPaths = (oldsrc, (newPath, Just entry)):media} - return $ TS.pack newPath) + return $ T.pack newPath) (\e -> do - report $ CouldNotFetchResource (TS.pack oldsrc) (tshow e) - return $ TS.pack oldsrc) + report $ CouldNotFetchResource (T.pack oldsrc) (tshow e) + return $ T.pack oldsrc) -getMediaNextNewName :: PandocMonad m => String -> E m String +getMediaNextNewName :: PandocMonad m => FilePath -> E m FilePath getMediaNextNewName ext = do nextId <- gets stMediaNextId modify $ \st -> st { stMediaNextId = nextId + 1 } @@ -1128,11 +1132,11 @@ transformInline :: PandocMonad m -> Inline -> E m Inline transformInline _opts (Image attr lab (src,tit)) = do - newsrc <- modifyMediaRef $ TS.unpack src + newsrc <- modifyMediaRef $ T.unpack src return $ Image attr lab ("../" <> newsrc, tit) transformInline opts x@(Math t m) | WebTeX url <- writerHTMLMathMethod opts = do - newsrc <- modifyMediaRef (TS.unpack url <> urlEncode (TS.unpack m)) + newsrc <- modifyMediaRef (T.unpack url <> urlEncode (T.unpack m)) let mathclass = if t == DisplayMath then "display" else "inline" return $ Span ("",["math",mathclass],[]) [Image nullAttr [x] ("../" <> newsrc, "")] @@ -1143,40 +1147,26 @@ transformInline _opts (RawInline fmt raw) return $ RawInline fmt (renderTags' tags') transformInline _ x = return x -(!) :: (t -> Element) -> [(String, String)] -> t -> Element +(!) :: (t -> Element) -> [(Text, Text)] -> t -> Element (!) f attrs n = add_attrs (map (\(k,v) -> Attr (unqual k) v) attrs) (f n) --- | Version of 'ppTopElement' that specifies UTF-8 encoding. -ppTopElement :: Element -> String -ppTopElement = ("\n" ++) . unEntity . ppElement - -- unEntity removes numeric entities introduced by ppElement - -- (kindlegen seems to choke on these). - where unEntity [] = "" - unEntity ('&':'#':xs) = - let (ds,ys) = break (==';') xs - rest = drop 1 ys - in case safeRead (TS.pack $ "'\\" <> ds <> "'") of - Just x -> x : unEntity rest - Nothing -> '&':'#':unEntity xs - unEntity (x:xs) = x : unEntity xs - mediaTypeOf :: FilePath -> Maybe MimeType mediaTypeOf x = let mediaPrefixes = ["image", "video", "audio"] in case getMimeType x of - Just y | any (`TS.isPrefixOf` y) mediaPrefixes -> Just y + Just y | any (`T.isPrefixOf` y) mediaPrefixes -> Just y _ -> Nothing -- Returns filename for chapter number. -showChapter :: Int -> String -showChapter = printf "ch%03d.xhtml" +showChapter :: Int -> Text +showChapter = T.pack . printf "ch%03d.xhtml" -- Add identifiers to any headers without them. addIdentifiers :: WriterOptions -> [Block] -> [Block] addIdentifiers opts bs = evalState (mapM go bs) Set.empty where go (Header n (ident,classes,kvs) ils) = do ids <- get - let ident' = if TS.null ident + let ident' = if T.null ident then uniqueIdent (writerExtensions opts) ils ids else ident modify $ Set.insert ident' @@ -1184,27 +1174,27 @@ addIdentifiers opts bs = evalState (mapM go bs) Set.empty go x = return x -- Variant of normalizeDate that allows partial dates: YYYY, YYYY-MM -normalizeDate' :: String -> Maybe String -normalizeDate' = fmap TS.unpack . go . trim . TS.pack +normalizeDate' :: Text -> Maybe Text +normalizeDate' = go . T.strip where go xs - | TS.length xs == 4 -- YYY - , TS.all isDigit xs = Just xs - | (y, s) <- TS.splitAt 4 xs -- YYY-MM - , Just ('-', m) <- TS.uncons s - , TS.length m == 2 - , TS.all isDigit y && TS.all isDigit m = Just xs + | T.length xs == 4 -- YYY + , T.all isDigit xs = Just xs + | (y, s) <- T.splitAt 4 xs -- YYY-MM + , Just ('-', m) <- T.uncons s + , T.length m == 2 + , T.all isDigit y && T.all isDigit m = Just xs | otherwise = normalizeDate xs -toRelator :: String -> Maybe String +toRelator :: Text -> Maybe Text toRelator x | x `elem` relators = Just x - | otherwise = lookup (map toLower x) relatorMap + | otherwise = lookup (T.toLower x) relatorMap -relators :: [String] +relators :: [Text] relators = map snd relatorMap -relatorMap :: [(String, String)] +relatorMap :: [(Text, Text)] relatorMap = [("abridger", "abr") ,("actor", "act") diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 9334d6e9a..3b5d04427 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -25,15 +25,12 @@ import Data.ByteString.Base64 (encode) import Data.Char (isAscii, isControl, isSpace) import Data.Either (lefts, rights) import Data.List (intercalate) -import Data.Text (Text, pack) +import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE import Network.HTTP (urlEncode) -import Text.XML.Light -import qualified Text.XML.Light as X -import qualified Text.XML.Light.Cursor as XC -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light as X import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import qualified Text.Pandoc.Class.PandocMonad as P @@ -44,6 +41,7 @@ import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def) import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers, makeSections, tshow, stringify) import Text.Pandoc.Writers.Shared (lookupMetaString, toLegacyTable) +import Data.Generics (everywhere, mkT) -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -88,7 +86,7 @@ pandocToFB2 opts (Pandoc meta blocks) = do (imgs,missing) <- get >>= (lift . fetchImages . imagesToFetch) let body' = replaceImagesWithAlt missing body let fb2_xml = el "FictionBook" (fb2_attrs, [desc, body'] ++ notes ++ imgs) - return $ pack $ xml_head ++ showContent fb2_xml ++ "\n" + return $ xml_head <> showContent fb2_xml <> "\n" where xml_head = "\n" fb2_attrs = @@ -100,8 +98,8 @@ pandocToFB2 opts (Pandoc meta blocks) = do description :: PandocMonad m => Meta -> FBM m Content description meta' = do let genre = case lookupMetaString "genre" meta' of - "" -> el "genre" ("unrecognised" :: String) - s -> el "genre" (T.unpack s) + "" -> el "genre" ("unrecognised" :: Text) + s -> el "genre" s bt <- booktitle meta' let as = authors meta' dd <- docdate meta' @@ -112,7 +110,7 @@ description meta' = do Just (MetaInlines [Str s]) -> [el "lang" $ iso639 s] Just (MetaString s) -> [el "lang" $ iso639 s] _ -> [] - where iso639 = T.unpack . T.takeWhile (/= '-') -- Convert BCP 47 to ISO 639 + where iso639 = T.takeWhile (/= '-') -- Convert BCP 47 to ISO 639 let coverimage url = do let img = Image nullAttr mempty (url, "") im <- insertImage InlineImage img @@ -124,7 +122,7 @@ description meta' = do return $ el "description" [ el "title-info" (genre : (as ++ bt ++ annotation ++ dd ++ coverpage ++ lang)) - , el "document-info" [el "program-used" ("pandoc" :: String)] + , el "document-info" [el "program-used" ("pandoc" :: Text)] ] booktitle :: PandocMonad m => Meta -> FBM m [Content] @@ -137,15 +135,15 @@ authors meta' = cMap author (docAuthors meta') author :: [Inline] -> [Content] author ss = - let ws = words . cMap plain $ ss - email = el "email" <$> take 1 (filter ('@' `elem`) ws) - ws' = filter ('@' `notElem`) ws + let ws = T.words $ mconcat $ map plain ss + email = el "email" <$> take 1 (filter (T.any (=='@')) ws) + ws' = filter (not . T.any (== '@')) ws names = case ws' of [nickname] -> [ el "nickname" nickname ] [fname, lname] -> [ el "first-name" fname , el "last-name" lname ] (fname:rest) -> [ el "first-name" fname - , el "middle-name" (concat . init $ rest) + , el "middle-name" (T.concat . init $ rest) , el "last-name" (last rest) ] [] -> [] in list $ el "author" (names ++ email) @@ -206,7 +204,7 @@ renderFootnotes = do el "body" ([uattr "name" "notes"], map renderFN (reverse fns)) where renderFN (n, idstr, cs) = - let fn_texts = el "title" (el "p" (show n)) : cs + let fn_texts = el "title" (el "p" (tshow n)) : cs in el "section" ([uattr "id" idstr], fn_texts) -- | Fetch images and encode them for the FictionBook XML. @@ -282,7 +280,7 @@ isMimeType s = where types = ["text","image","audio","video","application","message","multipart"] valid c = isAscii c && not (isControl c) && not (isSpace c) && - c `notElem` ("()<>@,;:\\\"/[]?=" :: String) + c `notElem` ("()<>@,;:\\\"/[]?=" :: [Char]) footnoteID :: Int -> Text footnoteID i = "n" <> tshow i @@ -306,7 +304,7 @@ blockToXml (Para [Image atr alt (src,tgt)]) = insertImage NormalImage (Image atr alt (src,tit)) blockToXml (Para ss) = list . el "p" <$> cMapM toXml ss blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . - map (el "p" . el "code" . T.unpack) . T.lines $ s + map (el "p" . el "code") . T.lines $ s blockToXml (RawBlock f str) = if f == Format "fb2" then @@ -346,11 +344,11 @@ blockToXml (Table _ blkCapt specs thead tbody tfoot) = do c <- el "emphasis" <$> cMapM toXml caption return [el "table" (hd <> bd), el "p" c] where - mkrow :: PandocMonad m => String -> [[Block]] -> [Alignment] -> FBM m Content + mkrow :: PandocMonad m => Text -> [[Block]] -> [Alignment] -> FBM m Content mkrow tag cells aligns' = el "tr" <$> mapM (mkcell tag) (zip cells aligns') -- - mkcell :: PandocMonad m => String -> ([Block], Alignment) -> FBM m Content + mkcell :: PandocMonad m => Text -> ([Block], Alignment) -> FBM m Content mkcell tag (cell, align) = do cblocks <- cMapM blockToXml cell return $ el tag ([align_attr align], cblocks) @@ -424,7 +422,7 @@ toXml (Quoted DoubleQuote ss) = do inner <- cMapM toXml ss return $ [txt "“"] ++ inner ++ [txt "”"] toXml (Cite _ ss) = cMapM toXml ss -- FIXME: support citation styles -toXml (Code _ s) = return [el "code" $ T.unpack s] +toXml (Code _ s) = return [el "code" s] toXml Space = return [txt " "] toXml SoftBreak = return [txt "\n"] toXml LineBreak = return [txt "\n"] @@ -456,7 +454,7 @@ insertMath immode formula = do let imgurl = url <> T.pack (urlEncode $ T.unpack formula) let img = Image nullAttr alt (imgurl, "") insertImage immode img - _ -> return [el "code" $ T.unpack formula] + _ -> return [el "code" formula] insertImage :: PandocMonad m => ImageMode -> Inline -> FBM m [Content] insertImage immode (Image _ alt (url,ttl)) = do @@ -471,31 +469,16 @@ insertImage immode (Image _ alt (url,ttl)) = do el "image" $ [ attr ("l","href") ("#" <> fname) , attr ("l","type") (tshow immode) - , uattr "alt" (T.pack $ cMap plain alt) ] + , uattr "alt" (mconcat $ map plain alt) ] ++ ttlattr insertImage _ _ = error "unexpected inline instead of image" replaceImagesWithAlt :: [Text] -> Content -> Content -replaceImagesWithAlt missingHrefs body = - let cur = XC.fromContent body - cur' = replaceAll cur - in XC.toTree . XC.root $ cur' +replaceImagesWithAlt missingHrefs = everywhere (mkT go) where - -- - replaceAll :: XC.Cursor -> XC.Cursor - replaceAll c = - let n = XC.current c - c' = if isImage n && isMissing n - then XC.modifyContent replaceNode c - else c - in case XC.nextDF c' of - (Just cnext) -> replaceAll cnext - Nothing -> c' -- end of document - -- - isImage :: Content -> Bool - isImage (Elem e) = elName e == uname "image" - isImage _ = False - -- + go c = if isMissing c + then replaceNode c + else c isMissing (Elem img@Element{}) = let imgAttrs = elAttribs img badAttrs = map (attr ("l","href")) missingHrefs @@ -505,18 +488,18 @@ replaceImagesWithAlt missingHrefs body = replaceNode :: Content -> Content replaceNode n@(Elem img@Element{}) = let attrs = elAttribs img - alt = getAttrVal attrs (uname "alt") + alt = getAttrVal attrs (unqual "alt") imtype = getAttrVal attrs (qname "l" "type") in case (alt, imtype) of (Just alt', Just imtype') -> - if imtype' == show NormalImage + if imtype' == tshow NormalImage then el "p" alt' - else txt $ T.pack alt' - (Just alt', Nothing) -> txt $ T.pack alt' -- no type attribute + else txt alt' + (Just alt', Nothing) -> txt alt' -- no type attribute _ -> n -- don't replace if alt text is not found replaceNode n = n -- - getAttrVal :: [X.Attr] -> QName -> Maybe String + getAttrVal :: [X.Attr] -> QName -> Maybe Text getAttrVal attrs name = case filter ((name ==) . attrKey) attrs of (a:_) -> Just (attrVal a) @@ -524,7 +507,7 @@ replaceImagesWithAlt missingHrefs body = -- | Wrap all inlines with an XML tag (given its unqualified name). -wrap :: PandocMonad m => String -> [Inline] -> FBM m Content +wrap :: PandocMonad m => Text -> [Inline] -> FBM m Content wrap tagname inlines = el tagname `liftM` cMapM toXml inlines -- " Create a singleton list. @@ -532,31 +515,31 @@ list :: a -> [a] list = (:[]) -- | Convert an 'Inline' to plaintext. -plain :: Inline -> String -plain (Str s) = T.unpack s -plain (Emph ss) = cMap plain ss -plain (Underline ss) = cMap plain ss -plain (Span _ ss) = cMap plain ss -plain (Strong ss) = cMap plain ss -plain (Strikeout ss) = cMap plain ss -plain (Superscript ss) = cMap plain ss -plain (Subscript ss) = cMap plain ss -plain (SmallCaps ss) = cMap plain ss -plain (Quoted _ ss) = cMap plain ss -plain (Cite _ ss) = cMap plain ss -- FIXME -plain (Code _ s) = T.unpack s +plain :: Inline -> Text +plain (Str s) = s +plain (Emph ss) = mconcat $ map plain ss +plain (Underline ss) = mconcat $ map plain ss +plain (Span _ ss) = mconcat $ map plain ss +plain (Strong ss) = mconcat $ map plain ss +plain (Strikeout ss) = mconcat $ map plain ss +plain (Superscript ss) = mconcat $ map plain ss +plain (Subscript ss) = mconcat $ map plain ss +plain (SmallCaps ss) = mconcat $ map plain ss +plain (Quoted _ ss) = mconcat $ map plain ss +plain (Cite _ ss) = mconcat $ map plain ss -- FIXME +plain (Code _ s) = s plain Space = " " plain SoftBreak = " " plain LineBreak = "\n" -plain (Math _ s) = T.unpack s +plain (Math _ s) = s plain (RawInline _ _) = "" -plain (Link _ text (url,_)) = concat (map plain text ++ [" <", T.unpack url, ">"]) -plain (Image _ alt _) = cMap plain alt +plain (Link _ text (url,_)) = mconcat (map plain text ++ [" <", url, ">"]) +plain (Image _ alt _) = mconcat $ map plain alt plain (Note _) = "" -- FIXME -- | Create an XML element. el :: (Node t) - => String -- ^ unqualified element name + => Text -- ^ unqualified element name -> t -- ^ node contents -> Content -- ^ XML content el name cs = Elem $ unode name cs @@ -569,22 +552,18 @@ spaceBeforeAfter cs = -- | Create a plain-text XML content. txt :: Text -> Content -txt s = Text $ CData CDataText (T.unpack s) Nothing +txt s = Text $ CData CDataText s Nothing -- | Create an XML attribute with an unqualified name. -uattr :: String -> Text -> Text.XML.Light.Attr -uattr name = Attr (uname name) . T.unpack +uattr :: Text -> Text -> X.Attr +uattr name = Attr (unqual name) -- | Create an XML attribute with a qualified name from given namespace. -attr :: (String, String) -> Text -> Text.XML.Light.Attr -attr (ns, name) = Attr (qname ns name) . T.unpack - --- | Unqualified name -uname :: String -> QName -uname name = QName name Nothing Nothing +attr :: (Text, Text) -> Text -> X.Attr +attr (ns, name) = Attr (qname ns name) -- | Qualified name -qname :: String -> String -> QName +qname :: Text -> Text -> QName qname ns name = QName name Nothing (Just ns) -- | Abbreviation for 'concatMap'. diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index 06369b4db..101b236aa 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -40,9 +40,9 @@ import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy) import Text.Pandoc.Walk import Text.Pandoc.Writers.OpenDocument (writeOpenDocument) import Text.Pandoc.XML -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light import Text.TeXMath -import Text.XML.Light +import qualified Text.XML.Light as XL newtype ODTState = ODTState { stEntries :: [Entry] } @@ -181,18 +181,20 @@ updateStyleWithLang (Just lang) arch = do PandocXMLError "styles.xml" msg Right d -> return $ toEntry "styles.xml" epochtime - ( fromStringLazy + ( fromTextLazy + . TL.fromStrict . ppTopElement . addLang lang $ d ) else return e) (zEntries arch) return arch{ zEntries = entries } +-- TODO FIXME avoid this generic traversal! addLang :: Lang -> Element -> Element addLang lang = everywhere' (mkT updateLangAttr) where updateLangAttr (Attr n@(QName "language" _ (Just "fo")) _) - = Attr n (T.unpack $ langLanguage lang) + = Attr n (langLanguage lang) updateLangAttr (Attr n@(QName "country" _ (Just "fo")) _) - = Attr n (T.unpack $ langRegion lang) + = Attr n (langRegion lang) updateLangAttr x = x -- | transform both Image and Math elements @@ -238,8 +240,8 @@ transformPicMath _ (Math t math) = do case writeMathML dt <$> readTeX math of Left _ -> return $ Math t math Right r -> do - let conf = useShortEmptyTags (const False) defaultConfigPP - let mathml = ppcTopElement conf r + let conf = XL.useShortEmptyTags (const False) XL.defaultConfigPP + let mathml = XL.ppcTopElement conf r epochtime <- floor `fmap` lift P.getPOSIXTime let dirname = "Formula-" ++ show (length entries) ++ "/" let fname = dirname ++ "content.xml" diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs index 8f60e70d5..0533d6c12 100644 --- a/src/Text/Pandoc/Writers/OOXML.hs +++ b/src/Text/Pandoc/Writers/OOXML.hs @@ -29,33 +29,32 @@ import Control.Monad.Except (throwError) import Text.Pandoc.Error import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL -import qualified Data.ByteString.Lazy.Char8 as BL8 import Data.Maybe (mapMaybe) import qualified Data.Text as T +import Data.Text (Text) import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light as XML -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light -mknode :: Node t => String -> [(String,String)] -> t -> Element +mknode :: Node t => Text -> [(Text,Text)] -> t -> Element mknode s attrs = add_attrs (map (\(k,v) -> Attr (nodename k) v) attrs) . node (nodename s) -mktnode :: String -> [(String,String)] -> T.Text -> Element -mktnode s attrs = mknode s attrs . T.unpack +mktnode :: Text -> [(Text,Text)] -> T.Text -> Element +mktnode s attrs = mknode s attrs -nodename :: String -> QName +nodename :: Text -> QName nodename s = QName{ qName = name, qURI = Nothing, qPrefix = prefix } - where (name, prefix) = case break (==':') s of - (xs,[]) -> (xs, Nothing) - (ys, _:zs) -> (zs, Just ys) + where (name, prefix) = case T.break (==':') s of + (xs,ys) -> case T.uncons ys of + Nothing -> (xs, Nothing) + Just (_,zs) -> (zs, Just xs) toLazy :: B.ByteString -> BL.ByteString toLazy = BL.fromChunks . (:[]) renderXml :: Element -> BL.ByteString -renderXml elt = BL8.pack "\n" <> - UTF8.fromStringLazy (showElement elt) +renderXml elt = BL.fromStrict (UTF8.fromText (showTopElement elt)) parseXml :: PandocMonad m => Archive -> Archive -> String -> m Element parseXml refArchive distArchive relpath = @@ -70,25 +69,25 @@ parseXml refArchive distArchive relpath = -- Copied from Util -attrToNSPair :: XML.Attr -> Maybe (String, String) -attrToNSPair (XML.Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair :: Attr -> Maybe (Text, Text) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) attrToNSPair _ = Nothing elemToNameSpaces :: Element -> NameSpaces elemToNameSpaces = mapMaybe attrToNSPair . elAttribs -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = let ns' = ns ++ elemToNameSpaces element in qName (elName element) == name && qURI (elName element) == lookup prefix ns' -type NameSpaces = [(String, String)] +type NameSpaces = [(Text, Text)] -- | Scales the image to fit the page -- sizes are passed in emu diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs index 0a7060895..5caeb0753 100644 --- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs +++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs @@ -20,16 +20,16 @@ import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader import Control.Monad.State import Codec.Archive.Zip -import Data.Char (toUpper) import Data.List (intercalate, stripPrefix, nub, union, isPrefixOf, intersperse) import Data.Default +import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Read import Data.Time (formatTime, defaultTimeLocale) import Data.Time.Clock (UTCTime) import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime) import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light as XML import Text.Pandoc.Definition import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Class.PandocMonad (PandocMonad) @@ -48,6 +48,7 @@ import Text.DocTemplates (FromContext(lookupContext)) import Text.TeXMath import Text.Pandoc.Writers.Math (convertMath) import Text.Pandoc.Writers.Powerpoint.Presentation +import Text.Pandoc.Shared (tshow) import Skylighting (fromColor) -- |The 'EMU' type is used to specify sizes in English Metric Units. @@ -84,10 +85,13 @@ getPresentationSize refArchive distArchive = do sldSize <- findChild (elemName ns "p" "sldSz") presElement cxS <- findAttr (QName "cx" Nothing Nothing) sldSize cyS <- findAttr (QName "cy" Nothing Nothing) sldSize - (cx, _) <- listToMaybe $ reads cxS :: Maybe (Integer, String) - (cy, _) <- listToMaybe $ reads cyS :: Maybe (Integer, String) + cx <- readTextAsInteger cxS + cy <- readTextAsInteger cyS return (cx `div` 12700, cy `div` 12700) +readTextAsInteger :: Text -> Maybe Integer +readTextAsInteger = either (const Nothing) (Just . fst) . Data.Text.Read.decimal + data WriterEnv = WriterEnv { envRefArchive :: Archive , envDistArchive :: Archive , envUTCTime :: UTCTime @@ -161,9 +165,6 @@ runP env st p = evalStateT (runReaderT p env) st -------------------------------------------------------------------- -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText n = fmap T.pack . findAttr n - monospaceFont :: Monad m => P m T.Text monospaceFont = do vars <- writerVariables <$> asks envOpts @@ -171,10 +172,9 @@ monospaceFont = do Just s -> return s Nothing -> return "Courier" --- Kept as string for XML.Light -fontSizeAttributes :: Monad m => RunProps -> P m [(String, String)] +fontSizeAttributes :: Monad m => RunProps -> P m [(Text, Text)] fontSizeAttributes RunProps { rPropForceSize = Just sz } = - return [("sz", show $ sz * 100)] + return [("sz", tshow $ sz * 100)] fontSizeAttributes _ = return [] copyFileToArchive :: PandocMonad m => Archive -> FilePath -> P m Archive @@ -365,7 +365,7 @@ shapeHasId :: NameSpaces -> T.Text -> Element -> Bool shapeHasId ns ident element | Just nvSpPr <- findChild (elemName ns "p" "nvSpPr") element , Just cNvPr <- findChild (elemName ns "p" "cNvPr") nvSpPr - , Just nm <- findAttrText (QName "id" Nothing Nothing) cNvPr = + , Just nm <- findAttr (QName "id" Nothing Nothing) cNvPr = nm == ident | otherwise = False @@ -396,10 +396,10 @@ getShapeDimensions ns element ext <- findChild (elemName ns "a" "ext") xfrm cxS <- findAttr (QName "cx" Nothing Nothing) ext cyS <- findAttr (QName "cy" Nothing Nothing) ext - (x, _) <- listToMaybe $ reads xS - (y, _) <- listToMaybe $ reads yS - (cx, _) <- listToMaybe $ reads cxS - (cy, _) <- listToMaybe $ reads cyS + x <- readTextAsInteger xS + y <- readTextAsInteger yS + cx <- readTextAsInteger cxS + cy <- readTextAsInteger cyS return ((x `div` 12700, y `div` 12700), (cx `div` 12700, cy `div` 12700)) | otherwise = Nothing @@ -430,7 +430,7 @@ getContentShapeSize ns layout master Nothing -> do let mbSz = findChild (elemName ns "p" "nvSpPr") sp >>= findChild (elemName ns "p" "cNvPr") >>= - findAttrText (QName "id" Nothing Nothing) >>= + findAttr (QName "id" Nothing Nothing) >>= flip getMasterShapeDimensionsById master case mbSz of Just sz' -> return sz' @@ -450,8 +450,8 @@ buildSpTree ns spTreeElem newShapes = fn _ = True replaceNamedChildren :: NameSpaces - -> String - -> String + -> Text + -> Text -> [Element] -> Element -> Element @@ -654,10 +654,10 @@ createCaption contentShapeDimensions paraElements = do ] , mknode "p:spPr" [] [ mknode "a:xfrm" [] - [ mknode "a:off" [("x", show $ 12700 * x), - ("y", show $ 12700 * (y + cy - captionHeight))] () - , mknode "a:ext" [("cx", show $ 12700 * cx), - ("cy", show $ 12700 * captionHeight)] () + [ mknode "a:off" [("x", tshow $ 12700 * x), + ("y", tshow $ 12700 * (y + cy - captionHeight))] () + , mknode "a:ext" [("cx", tshow $ 12700 * cx), + ("cy", tshow $ 12700 * captionHeight)] () ] , mknode "a:prstGeom" [("prst", "rect")] [ mknode "a:avLst" [] () @@ -706,11 +706,13 @@ makePicElements layout picProps mInfo alt = do ,("noChangeAspect","1")] () -- cNvPr will contain the link information so we do that separately, -- and register the link if necessary. - let cNvPrAttr = [("descr", mInfoFilePath mInfo), ("id","0"),("name","Picture 1")] + let cNvPrAttr = [("descr", T.pack $ mInfoFilePath mInfo), + ("id","0"), + ("name","Picture 1")] cNvPr <- case picPropLink picProps of Just link -> do idNum <- registerLink link return $ mknode "p:cNvPr" cNvPrAttr $ - mknode "a:hlinkClick" [("r:id", "rId" <> show idNum)] () + mknode "a:hlinkClick" [("r:id", "rId" <> tshow idNum)] () Nothing -> return $ mknode "p:cNvPr" cNvPrAttr () let nvPicPr = mknode "p:nvPicPr" [] [ cNvPr @@ -718,13 +720,13 @@ makePicElements layout picProps mInfo alt = do , mknode "p:nvPr" [] ()] let blipFill = mknode "p:blipFill" [] [ mknode "a:blip" [("r:embed", "rId" <> - show (mInfoLocalId mInfo))] () + tshow (mInfoLocalId mInfo))] () , mknode "a:stretch" [] $ mknode "a:fillRect" [] () ] let xfrm = mknode "a:xfrm" [] - [ mknode "a:off" [("x",show xoff'), ("y",show yoff')] () - , mknode "a:ext" [("cx",show dimX') - ,("cy",show dimY')] () ] + [ mknode "a:off" [("x", tshow xoff'), ("y", tshow yoff')] () + , mknode "a:ext" [("cx", tshow dimX') + ,("cy", tshow dimY')] () ] let prstGeom = mknode "a:prstGeom" [("prst","rect")] $ mknode "a:avLst" [] () let ln = mknode "a:ln" [("w","9525")] @@ -763,7 +765,7 @@ paraElemToElements (Run rpr s) = do Just DoubleStrike -> [("strike", "dblStrike")] Nothing -> []) <> (case rBaseline rpr of - Just n -> [("baseline", show n)] + Just n -> [("baseline", tshow n)] Nothing -> []) <> (case rCap rpr of Just NoCapitals -> [("cap", "none")] @@ -780,43 +782,44 @@ paraElemToElements (Run rpr s) = do return $ case link of InternalTarget _ -> let linkAttrs = - [ ("r:id", "rId" <> show idNum) + [ ("r:id", "rId" <> tshow idNum) , ("action", "ppaction://hlinksldjump") ] in [mknode "a:hlinkClick" linkAttrs ()] -- external ExternalTarget _ -> let linkAttrs = - [ ("r:id", "rId" <> show idNum) + [ ("r:id", "rId" <> tshow idNum) ] in [mknode "a:hlinkClick" linkAttrs ()] Nothing -> return [] let colorContents = case rSolidFill rpr of Just color -> case fromColor color of - '#':hx -> [mknode "a:solidFill" [] - [mknode "a:srgbClr" [("val", map toUpper hx)] ()] - ] + '#':hx -> + [mknode "a:solidFill" [] + [mknode "a:srgbClr" + [("val", T.toUpper $ T.pack hx)] ()]] _ -> [] Nothing -> [] codeFont <- monospaceFont let codeContents = - [mknode "a:latin" [("typeface", T.unpack codeFont)] () | rPropCode rpr] + [mknode "a:latin" [("typeface", codeFont)] () | rPropCode rpr] let propContents = linkProps <> colorContents <> codeContents return [Elem $ mknode "a:r" [] [ mknode "a:rPr" attrs propContents - , mknode "a:t" [] $ T.unpack s + , mknode "a:t" [] s ]] paraElemToElements (MathElem mathType texStr) = do isInSpkrNotes <- asks envInSpeakerNotes if isInSpkrNotes then paraElemToElements $ Run def $ unTeXString texStr else do res <- convertMath writeOMML mathType (unTeXString texStr) - case res of + case fromXLElement <$> res of Right r -> return [Elem $ mknode "a14:m" [] $ addMathInfo r] Left (Str s) -> paraElemToElements (Run def s) Left _ -> throwError $ PandocShouldNeverHappenError "non-string math fallback" paraElemToElements (RawOOXMLParaElem str) = return - [Text (CData CDataRaw (T.unpack str) Nothing)] + [Text (CData CDataRaw str Nothing)] -- This is a bit of a kludge -- really requires adding an option to @@ -824,9 +827,10 @@ paraElemToElements (RawOOXMLParaElem str) = return -- step at a time. addMathInfo :: Element -> Element addMathInfo element = - let mathspace = Attr { attrKey = QName "m" Nothing (Just "xmlns") - , attrVal = "http://schemas.openxmlformats.org/officeDocument/2006/math" - } + let mathspace = + Attr { attrKey = QName "m" Nothing (Just "xmlns") + , attrVal = "http://schemas.openxmlformats.org/officeDocument/2006/math" + } in add_attr mathspace element -- We look through the element to see if it contains an a14:m @@ -849,13 +853,13 @@ surroundWithMathAlternate element = paragraphToElement :: PandocMonad m => Paragraph -> P m Element paragraphToElement par = do let - attrs = [("lvl", show $ pPropLevel $ paraProps par)] <> + attrs = [("lvl", tshow $ pPropLevel $ paraProps par)] <> (case pPropMarginLeft (paraProps par) of - Just px -> [("marL", show $ pixelsToEmu px)] + Just px -> [("marL", tshow $ pixelsToEmu px)] Nothing -> [] ) <> (case pPropIndent (paraProps par) of - Just px -> [("indent", show $ pixelsToEmu px)] + Just px -> [("indent", tshow $ pixelsToEmu px)] Nothing -> [] ) <> (case pPropAlign (paraProps par) of @@ -867,7 +871,7 @@ paragraphToElement par = do props = [] <> (case pPropSpaceBefore $ paraProps par of Just px -> [mknode "a:spcBef" [] [ - mknode "a:spcPts" [("val", show $ 100 * px)] () + mknode "a:spcPts" [("val", tshow $ 100 * px)] () ] ] Nothing -> [] @@ -910,7 +914,7 @@ shapeToElements layout (Pic picProps fp alt) = do shapeToElements layout (GraphicFrame tbls cptn) = map Elem <$> graphicFrameToElements layout tbls cptn shapeToElements _ (RawOOXMLShape str) = return - [Text (CData CDataRaw (T.unpack str) Nothing)] + [Text (CData CDataRaw str Nothing)] shapeToElements layout shp = do element <- shapeToElement layout shp return [Elem element] @@ -942,8 +946,10 @@ graphicFrameToElements layout tbls caption = do [mknode "p:ph" [("idx", "1")] ()] ] , mknode "p:xfrm" [] - [ mknode "a:off" [("x", show $ 12700 * x), ("y", show $ 12700 * y)] () - , mknode "a:ext" [("cx", show $ 12700 * cx), ("cy", show $ 12700 * cy)] () + [ mknode "a:off" [("x", tshow $ 12700 * x), + ("y", tshow $ 12700 * y)] () + , mknode "a:ext" [("cx", tshow $ 12700 * cx), + ("cy", tshow $ 12700 * cy)] () ] ] <> elements @@ -957,7 +963,7 @@ getDefaultTableStyle = do refArchive <- asks envRefArchive distArchive <- asks envDistArchive tblStyleLst <- parseXml refArchive distArchive "ppt/tableStyles.xml" - return $ findAttrText (QName "def" Nothing Nothing) tblStyleLst + return $ findAttr (QName "def" Nothing Nothing) tblStyleLst graphicToElement :: PandocMonad m => Integer -> Graphic -> P m Element graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do @@ -995,7 +1001,7 @@ graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do let mkrow border cells = mknode "a:tr" [("h", "0")] $ map (mkcell border) cells let mkgridcol w = mknode "a:gridCol" - [("w", show ((12700 * w) :: Integer))] () + [("w", tshow ((12700 * w) :: Integer))] () let hasHeader = not (all null hdrCells) mbDefTblStyle <- getDefaultTableStyle @@ -1004,7 +1010,7 @@ graphicToElement tableWidth (Tbl tblPr hdrCells rows) = do , ("bandRow", if tblPrBandRow tblPr then "1" else "0") ] (case mbDefTblStyle of Nothing -> [] - Just sty -> [mknode "a:tableStyleId" [] $ T.unpack sty]) + Just sty -> [mknode "a:tableStyleId" [] sty]) return $ mknode "a:graphic" [] [mknode "a:graphicData" [("uri", "http://schemas.openxmlformats.org/drawingml/2006/table")] @@ -1037,7 +1043,7 @@ findPHType ns spElem phType -- if it's a named PHType, we want to check that the attribute -- value matches. Just phElem | (PHType tp) <- phType -> - case findAttrText (QName "type" Nothing Nothing) phElem of + case findAttr (QName "type" Nothing Nothing) phElem of Just tp' -> tp == tp' Nothing -> False -- if it's an ObjType, we want to check that there is NO @@ -1204,7 +1210,7 @@ getSlideNumberFieldId notesMaster , Just txBody <- findChild (elemName ns "p" "txBody") sp , Just p <- findChild (elemName ns "a" "p") txBody , Just fld <- findChild (elemName ns "a" "fld") p - , Just fldId <- findAttrText (QName "id" Nothing Nothing) fld = + , Just fldId <- findAttr (QName "id" Nothing Nothing) fld = return fldId | otherwise = throwError $ PandocSomeError @@ -1283,11 +1289,11 @@ speakerNotesSlideNumber pgNum fieldId = [ mknode "a:bodyPr" [] () , mknode "a:lstStyle" [] () , mknode "a:p" [] - [ mknode "a:fld" [ ("id", T.unpack fieldId) + [ mknode "a:fld" [ ("id", fieldId) , ("type", "slidenum") ] [ mknode "a:rPr" [("lang", "en-US")] () - , mknode "a:t" [] (show pgNum) + , mknode "a:t" [] (tshow pgNum) ] , mknode "a:endParaRPr" [("lang", "en-US")] () ] @@ -1339,7 +1345,7 @@ getSlideIdNum sldId = do Just n -> return n Nothing -> throwError $ PandocShouldNeverHappenError $ - "Slide Id " <> T.pack (show sldId) <> " not found." + "Slide Id " <> tshow sldId <> " not found." slideNum :: PandocMonad m => Slide -> P m Int slideNum slide = getSlideIdNum $ slideId slide @@ -1356,7 +1362,7 @@ slideToRelId :: PandocMonad m => Slide -> P m T.Text slideToRelId slide = do n <- slideNum slide offset <- asks envSlideIdOffset - return $ "rId" <> T.pack (show $ n + offset) + return $ "rId" <> tshow (n + offset) data Relationship = Relationship { relId :: Int @@ -1368,13 +1374,11 @@ elementToRel :: Element -> Maybe Relationship elementToRel element | elName element == QName "Relationship" (Just "http://schemas.openxmlformats.org/package/2006/relationships") Nothing = do rId <- findAttr (QName "Id" Nothing Nothing) element - numStr <- stripPrefix "rId" rId - num <- case reads numStr :: [(Int, String)] of - (n, _) : _ -> Just n - [] -> Nothing - type' <- findAttrText (QName "Type" Nothing Nothing) element + numStr <- T.stripPrefix "rId" rId + num <- fromIntegral <$> readTextAsInteger numStr + type' <- findAttr (QName "Type" Nothing Nothing) element target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship num type' target + return $ Relationship num type' (T.unpack target) | otherwise = Nothing slideToPresRel :: PandocMonad m => Slide -> P m Relationship @@ -1463,10 +1467,9 @@ topLevelRelsEntry :: PandocMonad m => P m Entry topLevelRelsEntry = elemToEntry "_rels/.rels" $ relsToElement topLevelRels relToElement :: Relationship -> Element -relToElement rel = mknode "Relationship" [ ("Id", "rId" <> - show (relId rel)) - , ("Type", T.unpack $ relType rel) - , ("Target", relTarget rel) ] () +relToElement rel = mknode "Relationship" [ ("Id", "rId" <> tshow (relId rel)) + , ("Type", relType rel) + , ("Target", T.pack $ relTarget rel) ] () relsToElement :: [Relationship] -> Element relsToElement rels = mknode "Relationships" @@ -1501,7 +1504,8 @@ slideToSpeakerNotesEntry slide = do Just element | Just notesIdNum <- mbNotesIdNum -> Just <$> elemToEntry - ("ppt/notesSlides/notesSlide" <> show notesIdNum <> ".xml") + ("ppt/notesSlides/notesSlide" <> show notesIdNum <> + ".xml") element _ -> return Nothing @@ -1514,7 +1518,7 @@ slideToSpeakerNotesRelElement slide@Slide{} = do [("xmlns", "http://schemas.openxmlformats.org/package/2006/relationships")] [ mknode "Relationship" [ ("Id", "rId2") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide") - , ("Target", "../slides/slide" <> show idNum <> ".xml") + , ("Target", "../slides/slide" <> tshow idNum <> ".xml") ] () , mknode "Relationship" [ ("Id", "rId1") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesMaster") @@ -1547,15 +1551,15 @@ linkRelElement :: PandocMonad m => (Int, LinkTarget) -> P m Element linkRelElement (rIdNum, InternalTarget targetId) = do targetIdNum <- getSlideIdNum targetId return $ - mknode "Relationship" [ ("Id", "rId" <> show rIdNum) + mknode "Relationship" [ ("Id", "rId" <> tshow rIdNum) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide") - , ("Target", "slide" <> show targetIdNum <> ".xml") + , ("Target", "slide" <> tshow targetIdNum <> ".xml") ] () linkRelElement (rIdNum, ExternalTarget (url, _)) = return $ - mknode "Relationship" [ ("Id", "rId" <> show rIdNum) + mknode "Relationship" [ ("Id", "rId" <> tshow rIdNum) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink") - , ("Target", T.unpack url) + , ("Target", url) , ("TargetMode", "External") ] () @@ -1567,10 +1571,10 @@ mediaRelElement mInfo = let ext = fromMaybe "" (mInfoExt mInfo) in mknode "Relationship" [ ("Id", "rId" <> - show (mInfoLocalId mInfo)) + tshow (mInfoLocalId mInfo)) , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image") , ("Target", "../media/image" <> - show (mInfoGlobalId mInfo) <> T.unpack ext) + tshow (mInfoGlobalId mInfo) <> ext) ] () speakerNotesSlideRelElement :: PandocMonad m => Slide -> P m (Maybe Element) @@ -1580,7 +1584,7 @@ speakerNotesSlideRelElement slide = do return $ case M.lookup idNum mp of Nothing -> Nothing Just n -> - let target = "../notesSlides/notesSlide" <> show n <> ".xml" + let target = "../notesSlides/notesSlide" <> tshow n <> ".xml" in Just $ mknode "Relationship" [ ("Id", "rId2") , ("Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide") @@ -1619,9 +1623,9 @@ slideToSlideRelElement slide = do slideToSldIdElement :: PandocMonad m => Slide -> P m Element slideToSldIdElement slide = do n <- slideNum slide - let id' = show $ n + 255 + let id' = tshow $ n + 255 rId <- slideToRelId slide - return $ mknode "p:sldId" [("id", id'), ("r:id", T.unpack rId)] () + return $ mknode "p:sldId" [("id", id'), ("r:id", rId)] () presentationToSldIdLst :: PandocMonad m => Presentation -> P m Element presentationToSldIdLst (Presentation _ slides) = do @@ -1646,7 +1650,7 @@ presentationToPresentationElement pres@(Presentation _ slds) = do notesMasterElem = mknode "p:notesMasterIdLst" [] [ mknode "p:NotesMasterId" - [("r:id", "rId" <> show notesMasterRId)] + [("r:id", "rId" <> tshow notesMasterRId)] () ] @@ -1702,17 +1706,17 @@ docPropsElement docProps = do ,("xmlns:dcmitype","http://purl.org/dc/dcmitype/") ,("xmlns:xsi","http://www.w3.org/2001/XMLSchema-instance")] $ - mknode "dc:title" [] (maybe "" T.unpack $ dcTitle docProps) + mknode "dc:title" [] (fromMaybe "" $ dcTitle docProps) : - mknode "dc:creator" [] (maybe "" T.unpack $ dcCreator docProps) + mknode "dc:creator" [] (fromMaybe "" $ dcCreator docProps) : - mknode "cp:keywords" [] (T.unpack keywords) - : ( [mknode "dc:subject" [] $ maybe "" T.unpack $ dcSubject docProps | isJust (dcSubject docProps)]) - <> ( [mknode "dc:description" [] $ maybe "" T.unpack $ dcDescription docProps | isJust (dcDescription docProps)]) - <> ( [mknode "cp:category" [] $ maybe "" T.unpack $ cpCategory docProps | isJust (cpCategory docProps)]) + mknode "cp:keywords" [] keywords + : ( [mknode "dc:subject" [] $ fromMaybe "" $ dcSubject docProps | isJust (dcSubject docProps)]) + <> ( [mknode "dc:description" [] $ fromMaybe "" $ dcDescription docProps | isJust (dcDescription docProps)]) + <> ( [mknode "cp:category" [] $ fromMaybe "" $ cpCategory docProps | isJust (cpCategory docProps)]) <> (\x -> [ mknode "dcterms:created" [("xsi:type","dcterms:W3CDTF")] x - , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x - ]) (formatTime defaultTimeLocale "%FT%XZ" utctime) + , mknode "dcterms:modified" [("xsi:type","dcterms:W3CDTF")] x + ]) (T.pack $ formatTime defaultTimeLocale "%FT%XZ" utctime) docPropsToEntry :: PandocMonad m => DocProps -> P m Entry docPropsToEntry docProps = docPropsElement docProps >>= @@ -1723,8 +1727,8 @@ docCustomPropsElement :: PandocMonad m => DocProps -> P m Element docCustomPropsElement docProps = do let mkCustomProp (k, v) pid = mknode "property" [("fmtid","{D5CDD505-2E9C-101B-9397-08002B2CF9AE}") - ,("pid", show pid) - ,("name", T.unpack k)] $ mknode "vt:lpwstr" [] (T.unpack v) + ,("pid", tshow pid) + ,("name", k)] $ mknode "vt:lpwstr" [] v return $ mknode "Properties" [("xmlns","http://schemas.openxmlformats.org/officeDocument/2006/custom-properties") ,("xmlns:vt","http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes") @@ -1743,7 +1747,7 @@ viewPropsElement = do distArchive <- asks envDistArchive viewPrElement <- parseXml refArchive distArchive "ppt/viewProps.xml" -- remove "lastView" if it exists: - let notLastView :: Text.XML.Light.Attr -> Bool + let notLastView :: XML.Attr -> Bool notLastView attr = qName (attrKey attr) /= "lastView" return $ @@ -1755,15 +1759,15 @@ makeViewPropsEntry = viewPropsElement >>= elemToEntry "ppt/viewProps.xml" defaultContentTypeToElem :: DefaultContentType -> Element defaultContentTypeToElem dct = mknode "Default" - [("Extension", T.unpack $ defContentTypesExt dct), - ("ContentType", T.unpack $ defContentTypesType dct)] + [("Extension", defContentTypesExt dct), + ("ContentType", defContentTypesType dct)] () overrideContentTypeToElem :: OverrideContentType -> Element overrideContentTypeToElem oct = mknode "Override" - [("PartName", overrideContentTypesPart oct), - ("ContentType", T.unpack $ overrideContentTypesType oct)] + [("PartName", T.pack $ overrideContentTypesPart oct), + ("ContentType", overrideContentTypesType oct)] () contentTypesToElement :: ContentTypes -> Element @@ -1821,7 +1825,8 @@ getSpeakerNotesFilePaths :: PandocMonad m => P m [FilePath] getSpeakerNotesFilePaths = do mp <- asks envSpeakerNotesIdMap let notesIdNums = M.elems mp - return $ map (\n -> "ppt/notesSlides/notesSlide" <> show n <> ".xml") notesIdNums + return $ map (\n -> "ppt/notesSlides/notesSlide" <> show n <> ".xml") + notesIdNums presentationToContentTypes :: PandocMonad m => Presentation -> P m ContentTypes presentationToContentTypes p@(Presentation _ slides) = do @@ -1885,11 +1890,11 @@ getContentType fp | otherwise = Nothing -- Kept as String for XML.Light -autoNumAttrs :: ListAttributes -> [(String, String)] +autoNumAttrs :: ListAttributes -> [(Text, Text)] autoNumAttrs (startNum, numStyle, numDelim) = numAttr <> typeAttr where - numAttr = [("startAt", show startNum) | startNum /= 1] + numAttr = [("startAt", tshow startNum) | startNum /= 1] typeAttr = [("type", typeString <> delimString)] typeString = case numStyle of Decimal -> "arabic" diff --git a/src/Text/Pandoc/XML/Light.hs b/src/Text/Pandoc/XML/Light.hs new file mode 100644 index 000000000..38e4df218 --- /dev/null +++ b/src/Text/Pandoc/XML/Light.hs @@ -0,0 +1,586 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE DeriveDataTypeable #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.XML.Light + Copyright : Copyright (C) 2021 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +xml-light, which we used in pandoc's the XML-based readers, has +some limitations: in particular, it produces nodes with String +instead of Text, and the parser falls over on processing instructions +(see #7091). + +This module exports much of the API of xml-light, but using Text instead +of String. In addition, the xml-light parsers are replaced by xml-conduit's +well-tested parser. (The xml-conduit types are mapped to types +isomorphic to xml-light's, to avoid the need for massive code modifications +elsewhere.) Bridge functions to map xml-light types to this module's +types are also provided (since libraries like texmath still use xml-light). + +Another advantage of the xml-conduit parser is that it gives us +detailed information on xml parse errors. + +In the future we may want to move to using xml-conduit or another +xml library in the code base, but this change gives us +better performance and accuracy without much change in the +code that used xml-light. +-} +module Text.Pandoc.XML.Light + ( -- * Basic types, duplicating those from xml-light but with Text + -- instead of String + Line + , Content(..) + , Element(..) + , Attr(..) + , CData(..) + , CDataKind(..) + , QName(..) + , Node(..) + , unode + , unqual + , add_attr + , add_attrs + -- * Conversion functions from xml-light types + , fromXLQName + , fromXLCData + , fromXLElement + , fromXLAttr + , fromXLContent + -- * Replacement for xml-light's Text.XML.Proc + , strContent + , onlyElems + , elChildren + , onlyText + , findChildren + , filterChildren + , filterChildrenName + , findChild + , filterChild + , filterChildName + , findElement + , filterElement + , filterElementName + , findElements + , filterElements + , filterElementsName + , findAttr + , lookupAttr + , lookupAttrBy + , findAttrBy + -- * Replacement for xml-light's Text.XML.Output + , ppTopElement + , ppElement + , ppContent + , ppcElement + , ppcContent + , showTopElement + , showElement + , showContent + , useShortEmptyTags + , defaultConfigPP + , ConfigPP(..) + -- * Replacement for xml-light's Text.XML.Input + , parseXMLElement + , parseXMLContents + ) where + +import qualified Control.Exception as E +import qualified Text.XML as Conduit +import Text.XML.Unresolved (InvalidEventStream(..)) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Lazy as TL +import Data.Text.Lazy.Builder (Builder, singleton, fromText, toLazyText) +import qualified Data.Map as M +import Data.Data (Data) +import Data.Typeable (Typeable) +import Data.Maybe (mapMaybe, listToMaybe) +import Data.List(find) +import qualified Text.XML.Light as XL + +-- Drop in replacement for parseXMLDoc in xml-light. +parseXMLElement :: TL.Text -> Either T.Text Element +parseXMLElement t = + elementToElement . Conduit.documentRoot <$> + either (Left . T.pack . E.displayException) Right + (Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t) + +parseXMLContents :: TL.Text -> Either T.Text [Content] +parseXMLContents t = + case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of + Left e -> + case E.fromException e of + Just (ContentAfterRoot _) -> + elContent <$> parseXMLElement ("" <> t <> "") + _ -> Left . T.pack . E.displayException $ e + Right x -> Right [Elem . elementToElement . Conduit.documentRoot $ x] + +elementToElement :: Conduit.Element -> Element +elementToElement (Conduit.Element name attribMap nodes) = + Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing + where + attrs = map (\(n,v) -> Attr (nameToQname n) v) $ + M.toList attribMap + nameToQname (Conduit.Name localName mbns mbpref) = + case mbpref of + Nothing -> + case T.stripPrefix "xmlns:" localName of + Just rest -> QName rest mbns (Just "xmlns") + Nothing -> QName localName mbns mbpref + _ -> QName localName mbns mbpref + +nodeToContent :: Conduit.Node -> Maybe Content +nodeToContent (Conduit.NodeElement el) = + Just (Elem (elementToElement el)) +nodeToContent (Conduit.NodeContent t) = + Just (Text (CData CDataText t Nothing)) +nodeToContent _ = Nothing + +unqual :: Text -> QName +unqual x = QName x Nothing Nothing + +-- | Add an attribute to an element. +add_attr :: Attr -> Element -> Element +add_attr a e = add_attrs [a] e + +-- | Add some attributes to an element. +add_attrs :: [Attr] -> Element -> Element +add_attrs as e = e { elAttribs = as ++ elAttribs e } + +-- +-- type definitions lightly modified from xml-light +-- + +-- | A line is an Integer +type Line = Integer + +-- | XML content +data Content = Elem Element + | Text CData + | CRef Text + deriving (Show, Typeable, Data) + +-- | XML elements +data Element = Element { + elName :: QName, + elAttribs :: [Attr], + elContent :: [Content], + elLine :: Maybe Line + } deriving (Show, Typeable, Data) + +-- | XML attributes +data Attr = Attr { + attrKey :: QName, + attrVal :: Text + } deriving (Eq, Ord, Show, Typeable, Data) + +-- | XML CData +data CData = CData { + cdVerbatim :: CDataKind, + cdData :: Text, + cdLine :: Maybe Line + } deriving (Show, Typeable, Data) + +data CDataKind + = CDataText -- ^ Ordinary character data; pretty printer escapes &, < etc. + | CDataVerbatim -- ^ Unescaped character data; pretty printer embeds it in case (qURI q1, qURI q2) of + (Nothing,Nothing) -> compare (qPrefix q1) (qPrefix q2) + (u1,u2) -> compare u1 u2 + x -> x + +class Node t where + node :: QName -> t -> Element + +instance Node ([Attr],[Content]) where + node n (attrs,cont) = Element { elName = n + , elAttribs = attrs + , elContent = cont + , elLine = Nothing + } + +instance Node [Attr] where node n as = node n (as,[]::[Content]) +instance Node Attr where node n a = node n [a] +instance Node () where node n () = node n ([]::[Attr]) + +instance Node [Content] where node n cs = node n ([]::[Attr],cs) +instance Node Content where node n c = node n [c] +instance Node ([Attr],Content) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Content) where node n (a,c) = node n ([a],[c]) + +instance Node ([Attr],[Element]) where + node n (as,cs) = node n (as,map Elem cs) + +instance Node ([Attr],Element) where node n (as,c) = node n (as,[c]) +instance Node (Attr,Element) where node n (a,c) = node n ([a],c) +instance Node [Element] where node n es = node n ([]::[Attr],es) +instance Node Element where node n e = node n [e] + +instance Node ([Attr],[CData]) where + node n (as,cs) = node n (as,map Text cs) + +instance Node ([Attr],CData) where node n (as,c) = node n (as,[c]) +instance Node (Attr,CData) where node n (a,c) = node n ([a],c) +instance Node [CData] where node n es = node n ([]::[Attr],es) +instance Node CData where node n e = node n [e] + +instance Node ([Attr],Text) where + node n (as,t) = node n (as, CData { cdVerbatim = CDataText + , cdData = t + , cdLine = Nothing }) + +instance Node (Attr,Text ) where node n (a,t) = node n ([a],t) +instance Node Text where node n t = node n ([]::[Attr],t) + +-- | Create node with unqualified name +unode :: Node t => Text -> t -> Element +unode = node . unqual + +-- +-- conversion from xml-light +-- + +fromXLQName :: XL.QName -> QName +fromXLQName qn = QName { qName = T.pack $ XL.qName qn + , qURI = T.pack <$> XL.qURI qn + , qPrefix = T.pack <$> XL.qPrefix qn } + +fromXLCData :: XL.CData -> CData +fromXLCData cd = CData { cdVerbatim = case XL.cdVerbatim cd of + XL.CDataText -> CDataText + XL.CDataVerbatim -> CDataVerbatim + XL.CDataRaw -> CDataRaw + , cdData = T.pack $ XL.cdData cd + , cdLine = XL.cdLine cd } + +fromXLElement :: XL.Element -> Element +fromXLElement el = Element { elName = fromXLQName $ XL.elName el + , elAttribs = map fromXLAttr $ XL.elAttribs el + , elContent = map fromXLContent $ XL.elContent el + , elLine = XL.elLine el } + +fromXLAttr :: XL.Attr -> Attr +fromXLAttr (XL.Attr qn s) = Attr (fromXLQName qn) (T.pack s) + +fromXLContent :: XL.Content -> Content +fromXLContent (XL.Elem el) = Elem $ fromXLElement el +fromXLContent (XL.Text cd) = Text $ fromXLCData cd +fromXLContent (XL.CRef s) = CRef (T.pack s) + +-- +-- copied from xml-light Text.XML.Proc +-- + +-- | Get the text value of an XML element. This function +-- ignores non-text elements, and concatenates all text elements. +strContent :: Element -> Text +strContent = mconcat . map cdData . onlyText . elContent + +-- | Select only the elements from a list of XML content. +onlyElems :: [Content] -> [Element] +onlyElems xs = [ x | Elem x <- xs ] + +-- | Select only the elements from a parent. +elChildren :: Element -> [Element] +elChildren e = [ x | Elem x <- elContent e ] + +-- | Select only the text from a list of XML content. +onlyText :: [Content] -> [CData] +onlyText xs = [ x | Text x <- xs ] + +-- | Find all immediate children with the given name. +findChildren :: QName -> Element -> [Element] +findChildren q e = filterChildren ((q ==) . elName) e + +-- | Filter all immediate children wrt a given predicate. +filterChildren :: (Element -> Bool) -> Element -> [Element] +filterChildren p e = filter p (onlyElems (elContent e)) + + +-- | Filter all immediate children wrt a given predicate over their names. +filterChildrenName :: (QName -> Bool) -> Element -> [Element] +filterChildrenName p e = filter (p.elName) (onlyElems (elContent e)) + + +-- | Find an immediate child with the given name. +findChild :: QName -> Element -> Maybe Element +findChild q e = listToMaybe (findChildren q e) + +-- | Find an immediate child with the given name. +filterChild :: (Element -> Bool) -> Element -> Maybe Element +filterChild p e = listToMaybe (filterChildren p e) + +-- | Find an immediate child with name matching a predicate. +filterChildName :: (QName -> Bool) -> Element -> Maybe Element +filterChildName p e = listToMaybe (filterChildrenName p e) + +-- | Find the left-most occurrence of an element matching given name. +findElement :: QName -> Element -> Maybe Element +findElement q e = listToMaybe (findElements q e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElement :: (Element -> Bool) -> Element -> Maybe Element +filterElement p e = listToMaybe (filterElements p e) + +-- | Filter the left-most occurrence of an element wrt. given predicate. +filterElementName :: (QName -> Bool) -> Element -> Maybe Element +filterElementName p e = listToMaybe (filterElementsName p e) + +-- | Find all non-nested occurances of an element. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +findElements :: QName -> Element -> [Element] +findElements qn e = filterElementsName (qn==) e + +-- | Find all non-nested occurrences of an element wrt. given predicate. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElements :: (Element -> Bool) -> Element -> [Element] +filterElements p e + | p e = [e] + | otherwise = concatMap (filterElements p) $ onlyElems $ elContent e + +-- | Find all non-nested occurences of an element wrt a predicate over element names. +-- (i.e., once we have found an element, we do not search +-- for more occurances among the element's children). +filterElementsName :: (QName -> Bool) -> Element -> [Element] +filterElementsName p e = filterElements (p.elName) e + +-- | Lookup the value of an attribute. +findAttr :: QName -> Element -> Maybe Text +findAttr x e = lookupAttr x (elAttribs e) + +-- | Lookup attribute name from list. +lookupAttr :: QName -> [Attr] -> Maybe Text +lookupAttr x = lookupAttrBy (x ==) + +-- | Lookup the first attribute whose name satisfies the given predicate. +lookupAttrBy :: (QName -> Bool) -> [Attr] -> Maybe Text +lookupAttrBy p as = attrVal `fmap` find (p . attrKey) as + +-- | Lookup the value of the first attribute whose name +-- satisfies the given predicate. +findAttrBy :: (QName -> Bool) -> Element -> Maybe Text +findAttrBy p e = lookupAttrBy p (elAttribs e) + + +-- +-- duplicates functinos from Text.XML.Output +-- + +-- | The XML 1.0 header +xmlHeader :: Text +xmlHeader = "" + + +-------------------------------------------------------------------------------- +data ConfigPP = ConfigPP + { shortEmptyTag :: QName -> Bool + , prettify :: Bool + } + +-- | Default pretty orinting configuration. +-- * Always use abbreviate empty tags. +defaultConfigPP :: ConfigPP +defaultConfigPP = ConfigPP { shortEmptyTag = const True + , prettify = False + } + +-- | The predicate specifies for which empty tags we should use XML's +-- abbreviated notation . This is useful if we are working with +-- some XML-ish standards (such as certain versions of HTML) where some +-- empty tags should always be displayed in the form. +useShortEmptyTags :: (QName -> Bool) -> ConfigPP -> ConfigPP +useShortEmptyTags p c = c { shortEmptyTag = p } + + +-- | Specify if we should use extra white-space to make document more readable. +-- WARNING: This adds additional white-space to text elements, +-- and so it may change the meaning of the document. +useExtraWhiteSpace :: Bool -> ConfigPP -> ConfigPP +useExtraWhiteSpace p c = c { prettify = p } + +-- | A configuration that tries to make things pretty +-- (possibly at the cost of changing the semantics a bit +-- through adding white space.) +prettyConfigPP :: ConfigPP +prettyConfigPP = useExtraWhiteSpace True defaultConfigPP + + +-------------------------------------------------------------------------------- + + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppTopElement :: Element -> Text +ppTopElement = ppcTopElement prettyConfigPP + +-- | Pretty printing elements +ppElement :: Element -> Text +ppElement = ppcElement prettyConfigPP + +-- | Pretty printing content +ppContent :: Content -> Text +ppContent = ppcContent prettyConfigPP + +-- | Pretty printing renders XML documents faithfully, +-- with the exception that whitespace may be added\/removed +-- in non-verbatim character data. +ppcTopElement :: ConfigPP -> Element -> Text +ppcTopElement c e = T.unlines [xmlHeader,ppcElement c e] + +-- | Pretty printing elements +ppcElement :: ConfigPP -> Element -> Text +ppcElement c = TL.toStrict . toLazyText . ppElementS c mempty + +-- | Pretty printing content +ppcContent :: ConfigPP -> Content -> Text +ppcContent c = TL.toStrict . toLazyText . ppContentS c mempty + +ppcCData :: ConfigPP -> CData -> Text +ppcCData c = TL.toStrict . toLazyText . ppCDataS c mempty + +type Indent = Builder + +-- | Pretty printing content using ShowT +ppContentS :: ConfigPP -> Indent -> Content -> Builder +ppContentS c i x = case x of + Elem e -> ppElementS c i e + Text t -> ppCDataS c i t + CRef r -> showCRefS r + +ppElementS :: ConfigPP -> Indent -> Element -> Builder +ppElementS c i e = i <> tagStart (elName e) (elAttribs e) <> + (case elContent e of + [] | "?" `T.isPrefixOf` qName name -> fromText " ?>" + | shortEmptyTag c name -> fromText " />" + [Text t] -> singleton '>' <> ppCDataS c mempty t <> tagEnd name + cs -> singleton '>' <> nl <> + mconcat (map ((<> nl) . ppContentS c (sp <> i)) cs) <> + i <> tagEnd name + where (nl,sp) = if prettify c then ("\n"," ") else ("","") + ) + where name = elName e + +ppCDataS :: ConfigPP -> Indent -> CData -> Builder +ppCDataS c i t = i <> if cdVerbatim t /= CDataText || not (prettify c) + then showCDataS t + else foldr cons mempty (T.unpack (showCData t)) + where cons :: Char -> Builder -> Builder + cons '\n' ys = singleton '\n' <> i <> ys + cons y ys = singleton y <> ys + + + +-------------------------------------------------------------------------------- + +-- | Adds the header. +showTopElement :: Element -> Text +showTopElement c = xmlHeader <> showElement c + +showContent :: Content -> Text +showContent = ppcContent defaultConfigPP + +showElement :: Element -> Text +showElement = ppcElement defaultConfigPP + +showCData :: CData -> Text +showCData = ppcCData defaultConfigPP + +-- Note: crefs should not contain '&', ';', etc. +showCRefS :: Text -> Builder +showCRefS r = singleton '&' <> fromText r <> singleton ';' + +-- | Convert a text element to characters. +showCDataS :: CData -> Builder +showCDataS cd = + case cdVerbatim cd of + CDataText -> escStr (cdData cd) + CDataVerbatim -> fromText " escCData (cdData cd) <> + fromText "]]>" + CDataRaw -> fromText (cdData cd) + +-------------------------------------------------------------------------------- +escCData :: Text -> Builder +escCData t + | "]]>" `T.isPrefixOf` t = + fromText "]]]]>" <> fromText (T.drop 3 t) +escCData t + = case T.uncons t of + Nothing -> mempty + Just (c,t') -> singleton c <> escCData t' + +escChar :: Char -> Builder +escChar c = case c of + '<' -> fromText "<" + '>' -> fromText ">" + '&' -> fromText "&" + '"' -> fromText """ + -- we use ' instead of ' because IE apparently has difficulties + -- rendering ' in xhtml. + -- Reported by Rohan Drape . + '\'' -> fromText "'" + _ -> singleton c + + {- original xml-light version: + -- NOTE: We escape '\r' explicitly because otherwise they get lost + -- when parsed back in because of then end-of-line normalization rules. + _ | isPrint c || c == '\n' -> singleton c + | otherwise -> showText "&#" . showsT oc . singleton ';' + where oc = ord c + -} + +escStr :: Text -> Builder +escStr cs = if T.any needsEscape cs + then mconcat (map escChar (T.unpack cs)) + else fromText cs + where + needsEscape '<' = True + needsEscape '>' = True + needsEscape '&' = True + needsEscape '"' = True + needsEscape '\'' = True + needsEscape _ = False + +tagEnd :: QName -> Builder +tagEnd qn = fromText " showQName qn <> singleton '>' + +tagStart :: QName -> [Attr] -> Builder +tagStart qn as = singleton '<' <> showQName qn <> as_str + where as_str = if null as + then mempty + else mconcat (map showAttr as) + +showAttr :: Attr -> Builder +showAttr (Attr qn v) = singleton ' ' <> showQName qn <> + singleton '=' <> + singleton '"' <> escStr v <> singleton '"' + +showQName :: QName -> Builder +showQName q = + case qPrefix q of + Nothing -> fromText (qName q) + Just p -> fromText p <> singleton ':' <> fromText (qName q) diff --git a/src/Text/Pandoc/XMLParser.hs b/src/Text/Pandoc/XMLParser.hs deleted file mode 100644 index 8ad22a66a..000000000 --- a/src/Text/Pandoc/XMLParser.hs +++ /dev/null @@ -1,66 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -{- | - Module : Text.Pandoc.XMLParser - Copyright : Copyright (C) 2021 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane - Stability : alpha - Portability : portable - -Bridge to allow using xml-conduit's parser with xml-light's types. --} -module Text.Pandoc.XMLParser - ( parseXMLElement - , parseXMLContents - , module Text.XML.Light.Types - ) where - -import qualified Control.Exception as E -import qualified Text.XML as Conduit -import Text.XML.Unresolved (InvalidEventStream(..)) -import qualified Text.XML.Light as Light -import Text.XML.Light.Types -import qualified Data.Text as T -import qualified Data.Text.Lazy as TL -import qualified Data.Map as M -import Data.Maybe (mapMaybe) - --- Drop in replacement for parseXMLDoc in xml-light. -parseXMLElement :: TL.Text -> Either T.Text Light.Element -parseXMLElement t = - elementToElement . Conduit.documentRoot <$> - either (Left . T.pack . E.displayException) Right - (Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t) - -parseXMLContents :: TL.Text -> Either T.Text [Light.Content] -parseXMLContents t = - case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of - Left e -> - case E.fromException e of - Just (ContentAfterRoot _) -> - elContent <$> parseXMLElement ("" <> t <> "") - _ -> Left . T.pack . E.displayException $ e - Right x -> Right [Light.Elem . elementToElement . Conduit.documentRoot $ x] - -elementToElement :: Conduit.Element -> Light.Element -elementToElement (Conduit.Element name attribMap nodes) = - Light.Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing - where - attrs = map (\(n,v) -> Light.Attr (nameToQname n) (T.unpack v)) $ - M.toList attribMap - nameToQname (Conduit.Name localName mbns mbpref) = - case mbpref of - Nothing | "xmlns:" `T.isPrefixOf` localName -> - Light.QName (T.unpack $ T.drop 6 localName) (T.unpack <$> mbns) - (Just "xmlns") - _ -> Light.QName (T.unpack localName) (T.unpack <$> mbns) - (T.unpack <$> mbpref) - -nodeToContent :: Conduit.Node -> Maybe Light.Content -nodeToContent (Conduit.NodeElement el) = - Just (Light.Elem (elementToElement el)) -nodeToContent (Conduit.NodeContent t) = - Just (Light.Text (Light.CData Light.CDataText (T.unpack t) Nothing)) -nodeToContent _ = Nothing - diff --git a/test/Tests/Writers/OOXML.hs b/test/Tests/Writers/OOXML.hs index 376f02c55..c1e47622d 100644 --- a/test/Tests/Writers/OOXML.hs +++ b/test/Tests/Writers/OOXML.hs @@ -43,7 +43,8 @@ compareXMLBool _ _ = False displayDiff :: Content -> Content -> String displayDiff elemA elemB = - showDiff (1,1) $ getDiff (lines $ ppContent elemA) (lines $ ppContent elemB) + showDiff (1,1) + (getDiff (lines $ showContent elemA) (lines $ showContent elemB)) goldenArchive :: FilePath -> IO Archive goldenArchive fp = toArchive . BL.fromStrict <$> BS.readFile fp diff --git a/test/docx/golden/block_quotes.docx b/test/docx/golden/block_quotes.docx index d3b16d0f2..ed7d1165c 100644 Binary files a/test/docx/golden/block_quotes.docx and b/test/docx/golden/block_quotes.docx differ diff --git a/test/docx/golden/codeblock.docx b/test/docx/golden/codeblock.docx index 6293ef493..07ae75676 100644 Binary files a/test/docx/golden/codeblock.docx and b/test/docx/golden/codeblock.docx differ diff --git a/test/docx/golden/comments.docx b/test/docx/golden/comments.docx index 4205a1516..e5f034378 100644 Binary files a/test/docx/golden/comments.docx and b/test/docx/golden/comments.docx differ diff --git a/test/docx/golden/custom_style_no_reference.docx b/test/docx/golden/custom_style_no_reference.docx index adb3f23db..174942135 100644 Binary files a/test/docx/golden/custom_style_no_reference.docx and b/test/docx/golden/custom_style_no_reference.docx differ diff --git a/test/docx/golden/custom_style_preserve.docx b/test/docx/golden/custom_style_preserve.docx index 92c8137fe..b5c31a851 100644 Binary files a/test/docx/golden/custom_style_preserve.docx and b/test/docx/golden/custom_style_preserve.docx differ diff --git a/test/docx/golden/custom_style_reference.docx b/test/docx/golden/custom_style_reference.docx index f53470617..c42ca1b05 100644 Binary files a/test/docx/golden/custom_style_reference.docx and b/test/docx/golden/custom_style_reference.docx differ diff --git a/test/docx/golden/definition_list.docx b/test/docx/golden/definition_list.docx index d6af90a72..1cb4c1fd7 100644 Binary files a/test/docx/golden/definition_list.docx and b/test/docx/golden/definition_list.docx differ diff --git a/test/docx/golden/document-properties-short-desc.docx b/test/docx/golden/document-properties-short-desc.docx index e18dbe853..7122456ea 100644 Binary files a/test/docx/golden/document-properties-short-desc.docx and b/test/docx/golden/document-properties-short-desc.docx differ diff --git a/test/docx/golden/document-properties.docx b/test/docx/golden/document-properties.docx index 820299043..616ba0f81 100644 Binary files a/test/docx/golden/document-properties.docx and b/test/docx/golden/document-properties.docx differ diff --git a/test/docx/golden/headers.docx b/test/docx/golden/headers.docx index ae0f41d12..c30dcdee9 100644 Binary files a/test/docx/golden/headers.docx and b/test/docx/golden/headers.docx differ diff --git a/test/docx/golden/image.docx b/test/docx/golden/image.docx index 94cd35dfa..8a704b41e 100644 Binary files a/test/docx/golden/image.docx and b/test/docx/golden/image.docx differ diff --git a/test/docx/golden/inline_code.docx b/test/docx/golden/inline_code.docx index 879f2a25b..b1906c8c4 100644 Binary files a/test/docx/golden/inline_code.docx and b/test/docx/golden/inline_code.docx differ diff --git a/test/docx/golden/inline_formatting.docx b/test/docx/golden/inline_formatting.docx index 93f86478f..8adf1cf75 100644 Binary files a/test/docx/golden/inline_formatting.docx and b/test/docx/golden/inline_formatting.docx differ diff --git a/test/docx/golden/inline_images.docx b/test/docx/golden/inline_images.docx index 967d297f2..584117503 100644 Binary files a/test/docx/golden/inline_images.docx and b/test/docx/golden/inline_images.docx differ diff --git a/test/docx/golden/link_in_notes.docx b/test/docx/golden/link_in_notes.docx index c5614e2fa..8859fe55c 100644 Binary files a/test/docx/golden/link_in_notes.docx and b/test/docx/golden/link_in_notes.docx differ diff --git a/test/docx/golden/links.docx b/test/docx/golden/links.docx index 0f39a831f..b80f3b3ba 100644 Binary files a/test/docx/golden/links.docx and b/test/docx/golden/links.docx differ diff --git a/test/docx/golden/lists.docx b/test/docx/golden/lists.docx index 07046f223..35beed68a 100644 Binary files a/test/docx/golden/lists.docx and b/test/docx/golden/lists.docx differ diff --git a/test/docx/golden/lists_continuing.docx b/test/docx/golden/lists_continuing.docx index 3656618e6..2c29fd674 100644 Binary files a/test/docx/golden/lists_continuing.docx and b/test/docx/golden/lists_continuing.docx differ diff --git a/test/docx/golden/lists_multiple_initial.docx b/test/docx/golden/lists_multiple_initial.docx index 8798253d5..10a948886 100644 Binary files a/test/docx/golden/lists_multiple_initial.docx and b/test/docx/golden/lists_multiple_initial.docx differ diff --git a/test/docx/golden/lists_restarting.docx b/test/docx/golden/lists_restarting.docx index 0a24d1840..5b90e74a0 100644 Binary files a/test/docx/golden/lists_restarting.docx and b/test/docx/golden/lists_restarting.docx differ diff --git a/test/docx/golden/nested_anchors_in_header.docx b/test/docx/golden/nested_anchors_in_header.docx index 52bb7a217..cc81b46d1 100644 Binary files a/test/docx/golden/nested_anchors_in_header.docx and b/test/docx/golden/nested_anchors_in_header.docx differ diff --git a/test/docx/golden/notes.docx b/test/docx/golden/notes.docx index 182c06c64..1394dc442 100644 Binary files a/test/docx/golden/notes.docx and b/test/docx/golden/notes.docx differ diff --git a/test/docx/golden/raw-blocks.docx b/test/docx/golden/raw-blocks.docx index 7b69a56a3..0d1688694 100644 Binary files a/test/docx/golden/raw-blocks.docx and b/test/docx/golden/raw-blocks.docx differ diff --git a/test/docx/golden/raw-bookmarks.docx b/test/docx/golden/raw-bookmarks.docx index 3d3a35701..be1caef2d 100644 Binary files a/test/docx/golden/raw-bookmarks.docx and b/test/docx/golden/raw-bookmarks.docx differ diff --git a/test/docx/golden/table_one_row.docx b/test/docx/golden/table_one_row.docx index 5ae37b406..a1d2323c2 100644 Binary files a/test/docx/golden/table_one_row.docx and b/test/docx/golden/table_one_row.docx differ diff --git a/test/docx/golden/table_with_list_cell.docx b/test/docx/golden/table_with_list_cell.docx index c29aa6716..2f3a831a7 100644 Binary files a/test/docx/golden/table_with_list_cell.docx and b/test/docx/golden/table_with_list_cell.docx differ diff --git a/test/docx/golden/tables.docx b/test/docx/golden/tables.docx index 664493246..af066107c 100644 Binary files a/test/docx/golden/tables.docx and b/test/docx/golden/tables.docx differ diff --git a/test/docx/golden/track_changes_deletion.docx b/test/docx/golden/track_changes_deletion.docx index b6d15340e..9cc7a075f 100644 Binary files a/test/docx/golden/track_changes_deletion.docx and b/test/docx/golden/track_changes_deletion.docx differ diff --git a/test/docx/golden/track_changes_insertion.docx b/test/docx/golden/track_changes_insertion.docx index f8e1092d2..f8b8dcfde 100644 Binary files a/test/docx/golden/track_changes_insertion.docx and b/test/docx/golden/track_changes_insertion.docx differ diff --git a/test/docx/golden/track_changes_move.docx b/test/docx/golden/track_changes_move.docx index b4cda82f2..1c3baf0bf 100644 Binary files a/test/docx/golden/track_changes_move.docx and b/test/docx/golden/track_changes_move.docx differ diff --git a/test/docx/golden/track_changes_scrubbed_metadata.docx b/test/docx/golden/track_changes_scrubbed_metadata.docx index ee222efa0..28686970d 100644 Binary files a/test/docx/golden/track_changes_scrubbed_metadata.docx and b/test/docx/golden/track_changes_scrubbed_metadata.docx differ diff --git a/test/docx/golden/unicode.docx b/test/docx/golden/unicode.docx index c6f8d9c96..7051cefbd 100644 Binary files a/test/docx/golden/unicode.docx and b/test/docx/golden/unicode.docx differ diff --git a/test/docx/golden/verbatim_subsuper.docx b/test/docx/golden/verbatim_subsuper.docx index ea8146690..9df631640 100644 Binary files a/test/docx/golden/verbatim_subsuper.docx and b/test/docx/golden/verbatim_subsuper.docx differ diff --git a/test/pptx/code-custom.pptx b/test/pptx/code-custom.pptx index 58070eb3f..5e9c2c630 100644 Binary files a/test/pptx/code-custom.pptx and b/test/pptx/code-custom.pptx differ diff --git a/test/pptx/code-custom_templated.pptx b/test/pptx/code-custom_templated.pptx index db9b7e371..15232c32d 100644 Binary files a/test/pptx/code-custom_templated.pptx and b/test/pptx/code-custom_templated.pptx differ diff --git a/test/pptx/code.pptx b/test/pptx/code.pptx index c7b1ed7d5..aab0cc6f5 100644 Binary files a/test/pptx/code.pptx and b/test/pptx/code.pptx differ diff --git a/test/pptx/code_templated.pptx b/test/pptx/code_templated.pptx index 6944d92bf..fe5b675f3 100644 Binary files a/test/pptx/code_templated.pptx and b/test/pptx/code_templated.pptx differ diff --git a/test/pptx/document-properties-short-desc.pptx b/test/pptx/document-properties-short-desc.pptx index ae0d28429..de5e68151 100644 Binary files a/test/pptx/document-properties-short-desc.pptx and b/test/pptx/document-properties-short-desc.pptx differ diff --git a/test/pptx/document-properties-short-desc_templated.pptx b/test/pptx/document-properties-short-desc_templated.pptx index 37c74c69a..89e6fbdf2 100644 Binary files a/test/pptx/document-properties-short-desc_templated.pptx and b/test/pptx/document-properties-short-desc_templated.pptx differ diff --git a/test/pptx/document-properties.pptx b/test/pptx/document-properties.pptx index 324e443a1..6bcbd1b9c 100644 Binary files a/test/pptx/document-properties.pptx and b/test/pptx/document-properties.pptx differ diff --git a/test/pptx/document-properties_templated.pptx b/test/pptx/document-properties_templated.pptx index c81b983e3..79d48560b 100644 Binary files a/test/pptx/document-properties_templated.pptx and b/test/pptx/document-properties_templated.pptx differ diff --git a/test/pptx/endnotes.pptx b/test/pptx/endnotes.pptx index 30ce33db6..9d46036fe 100644 Binary files a/test/pptx/endnotes.pptx and b/test/pptx/endnotes.pptx differ diff --git a/test/pptx/endnotes_templated.pptx b/test/pptx/endnotes_templated.pptx index d6c604968..54ec7f305 100644 Binary files a/test/pptx/endnotes_templated.pptx and b/test/pptx/endnotes_templated.pptx differ diff --git a/test/pptx/endnotes_toc.pptx b/test/pptx/endnotes_toc.pptx index 000e17ecd..a028b346f 100644 Binary files a/test/pptx/endnotes_toc.pptx and b/test/pptx/endnotes_toc.pptx differ diff --git a/test/pptx/endnotes_toc_templated.pptx b/test/pptx/endnotes_toc_templated.pptx index fdcd2e29b..1158c16fc 100644 Binary files a/test/pptx/endnotes_toc_templated.pptx and b/test/pptx/endnotes_toc_templated.pptx differ diff --git a/test/pptx/images.pptx b/test/pptx/images.pptx index e73126376..670a825de 100644 Binary files a/test/pptx/images.pptx and b/test/pptx/images.pptx differ diff --git a/test/pptx/images_templated.pptx b/test/pptx/images_templated.pptx index e3f968e9e..6d297ef11 100644 Binary files a/test/pptx/images_templated.pptx and b/test/pptx/images_templated.pptx differ diff --git a/test/pptx/inline_formatting.pptx b/test/pptx/inline_formatting.pptx index eadb9372e..473b9498d 100644 Binary files a/test/pptx/inline_formatting.pptx and b/test/pptx/inline_formatting.pptx differ diff --git a/test/pptx/inline_formatting_templated.pptx b/test/pptx/inline_formatting_templated.pptx index 8ca6bab2b..2cdf54474 100644 Binary files a/test/pptx/inline_formatting_templated.pptx and b/test/pptx/inline_formatting_templated.pptx differ diff --git a/test/pptx/lists.pptx b/test/pptx/lists.pptx index ae188ee68..ffc2eb9f7 100644 Binary files a/test/pptx/lists.pptx and b/test/pptx/lists.pptx differ diff --git a/test/pptx/lists_templated.pptx b/test/pptx/lists_templated.pptx index 60301fa50..676954cb8 100644 Binary files a/test/pptx/lists_templated.pptx and b/test/pptx/lists_templated.pptx differ diff --git a/test/pptx/raw_ooxml.pptx b/test/pptx/raw_ooxml.pptx index 17124a50d..29164af15 100644 Binary files a/test/pptx/raw_ooxml.pptx and b/test/pptx/raw_ooxml.pptx differ diff --git a/test/pptx/raw_ooxml_templated.pptx b/test/pptx/raw_ooxml_templated.pptx index 19ae7dd4e..1742b3296 100644 Binary files a/test/pptx/raw_ooxml_templated.pptx and b/test/pptx/raw_ooxml_templated.pptx differ diff --git a/test/pptx/remove_empty_slides.pptx b/test/pptx/remove_empty_slides.pptx index b650b7585..c6df8e18e 100644 Binary files a/test/pptx/remove_empty_slides.pptx and b/test/pptx/remove_empty_slides.pptx differ diff --git a/test/pptx/remove_empty_slides_templated.pptx b/test/pptx/remove_empty_slides_templated.pptx index 0ab029614..cf6e52eef 100644 Binary files a/test/pptx/remove_empty_slides_templated.pptx and b/test/pptx/remove_empty_slides_templated.pptx differ diff --git a/test/pptx/slide_breaks.pptx b/test/pptx/slide_breaks.pptx index 2a6e35080..e06d9079d 100644 Binary files a/test/pptx/slide_breaks.pptx and b/test/pptx/slide_breaks.pptx differ diff --git a/test/pptx/slide_breaks_slide_level_1.pptx b/test/pptx/slide_breaks_slide_level_1.pptx index a7bcf6a4b..449339778 100644 Binary files a/test/pptx/slide_breaks_slide_level_1.pptx and b/test/pptx/slide_breaks_slide_level_1.pptx differ diff --git a/test/pptx/slide_breaks_slide_level_1_templated.pptx b/test/pptx/slide_breaks_slide_level_1_templated.pptx index 21b018c25..258098082 100644 Binary files a/test/pptx/slide_breaks_slide_level_1_templated.pptx and b/test/pptx/slide_breaks_slide_level_1_templated.pptx differ diff --git a/test/pptx/slide_breaks_templated.pptx b/test/pptx/slide_breaks_templated.pptx index 4ec4772a4..2f0213919 100644 Binary files a/test/pptx/slide_breaks_templated.pptx and b/test/pptx/slide_breaks_templated.pptx differ diff --git a/test/pptx/slide_breaks_toc.pptx b/test/pptx/slide_breaks_toc.pptx index 5983657b6..9dbfa41a0 100644 Binary files a/test/pptx/slide_breaks_toc.pptx and b/test/pptx/slide_breaks_toc.pptx differ diff --git a/test/pptx/slide_breaks_toc_templated.pptx b/test/pptx/slide_breaks_toc_templated.pptx index dd54c7082..f288dde14 100644 Binary files a/test/pptx/slide_breaks_toc_templated.pptx and b/test/pptx/slide_breaks_toc_templated.pptx differ diff --git a/test/pptx/speaker_notes.pptx b/test/pptx/speaker_notes.pptx index b3e5ed5b9..0ab1302da 100644 Binary files a/test/pptx/speaker_notes.pptx and b/test/pptx/speaker_notes.pptx differ diff --git a/test/pptx/speaker_notes_after_metadata.pptx b/test/pptx/speaker_notes_after_metadata.pptx index 1078854bb..6343bffe4 100644 Binary files a/test/pptx/speaker_notes_after_metadata.pptx and b/test/pptx/speaker_notes_after_metadata.pptx differ diff --git a/test/pptx/speaker_notes_after_metadata_templated.pptx b/test/pptx/speaker_notes_after_metadata_templated.pptx index 5116c6c4e..5d4465f64 100644 Binary files a/test/pptx/speaker_notes_after_metadata_templated.pptx and b/test/pptx/speaker_notes_after_metadata_templated.pptx differ diff --git a/test/pptx/speaker_notes_afterheader.pptx b/test/pptx/speaker_notes_afterheader.pptx index 0c8e49bd9..d581681aa 100644 Binary files a/test/pptx/speaker_notes_afterheader.pptx and b/test/pptx/speaker_notes_afterheader.pptx differ diff --git a/test/pptx/speaker_notes_afterheader_templated.pptx b/test/pptx/speaker_notes_afterheader_templated.pptx index 68695939d..c922df3a8 100644 Binary files a/test/pptx/speaker_notes_afterheader_templated.pptx and b/test/pptx/speaker_notes_afterheader_templated.pptx differ diff --git a/test/pptx/speaker_notes_afterseps.pptx b/test/pptx/speaker_notes_afterseps.pptx index 7ed9b946d..13f564bf0 100644 Binary files a/test/pptx/speaker_notes_afterseps.pptx and b/test/pptx/speaker_notes_afterseps.pptx differ diff --git a/test/pptx/speaker_notes_afterseps_templated.pptx b/test/pptx/speaker_notes_afterseps_templated.pptx index 79fc82345..f0b302738 100644 Binary files a/test/pptx/speaker_notes_afterseps_templated.pptx and b/test/pptx/speaker_notes_afterseps_templated.pptx differ diff --git a/test/pptx/speaker_notes_templated.pptx b/test/pptx/speaker_notes_templated.pptx index 9f943c279..2d8d00242 100644 Binary files a/test/pptx/speaker_notes_templated.pptx and b/test/pptx/speaker_notes_templated.pptx differ diff --git a/test/pptx/start_numbering_at.pptx b/test/pptx/start_numbering_at.pptx index ac72d8ced..4320128b3 100644 Binary files a/test/pptx/start_numbering_at.pptx and b/test/pptx/start_numbering_at.pptx differ diff --git a/test/pptx/start_numbering_at_templated.pptx b/test/pptx/start_numbering_at_templated.pptx index 15c7b5469..0a3f6e56d 100644 Binary files a/test/pptx/start_numbering_at_templated.pptx and b/test/pptx/start_numbering_at_templated.pptx differ diff --git a/test/pptx/tables.pptx b/test/pptx/tables.pptx index 926c5e699..e41219844 100644 Binary files a/test/pptx/tables.pptx and b/test/pptx/tables.pptx differ diff --git a/test/pptx/tables_templated.pptx b/test/pptx/tables_templated.pptx index a37e72d2c..82b8aa13d 100644 Binary files a/test/pptx/tables_templated.pptx and b/test/pptx/tables_templated.pptx differ diff --git a/test/pptx/two_column.pptx b/test/pptx/two_column.pptx index 7f86533fe..270a7eeac 100644 Binary files a/test/pptx/two_column.pptx and b/test/pptx/two_column.pptx differ diff --git a/test/pptx/two_column_templated.pptx b/test/pptx/two_column_templated.pptx index 89e3db0ab..44985d701 100644 Binary files a/test/pptx/two_column_templated.pptx and b/test/pptx/two_column_templated.pptx differ -- cgit v1.2.3 From 73add0578989e1da6e9cd1de68e2e4142f789188 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 17 Feb 2021 09:54:39 -0800 Subject: Docx reader: use Map instead of list for Namespaces. This gives a speedup of about 5-10%. The reader is now approximately twice as fast as in the last release. --- src/Text/Pandoc/Readers/Docx/Parse.hs | 14 +++++++------- src/Text/Pandoc/Readers/Docx/Util.hs | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index c76f3c171..f8ed248d7 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -396,9 +396,9 @@ archiveToNotes zf = >>= parseXMLFromEntry enElem = findEntryByPath "word/endnotes.xml" zf >>= parseXMLFromEntry - fn_namespaces = maybe [] elemToNameSpaces fnElem - en_namespaces = maybe [] elemToNameSpaces enElem - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn_namespaces = maybe mempty elemToNameSpaces fnElem + en_namespaces = maybe mempty elemToNameSpaces enElem + ns = M.union fn_namespaces en_namespaces fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in @@ -408,7 +408,7 @@ archiveToComments :: Archive -> Comments archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= parseXMLFromEntry - cmts_namespaces = maybe [] elemToNameSpaces cmtsElem + cmts_namespaces = maybe mempty elemToNameSpaces cmtsElem cmts = elemToComments cmts_namespaces . walkDocument cmts_namespaces <$> cmtsElem in @@ -518,7 +518,7 @@ levelElemToLevel _ _ = Nothing archiveToNumbering' :: Archive -> Maybe Numbering archiveToNumbering' zf = case findEntryByPath "word/numbering.xml" zf of - Nothing -> Just $ Numbering [] [] [] + Nothing -> Just $ Numbering mempty [] [] Just entry -> do numberingElem <- parseXMLFromEntry entry let namespaces = elemToNameSpaces numberingElem @@ -530,7 +530,7 @@ archiveToNumbering' zf = archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = - fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) + fromMaybe (Numbering mempty [] []) (archiveToNumbering' archive) elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element @@ -875,7 +875,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (M.lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index 21df03d9e..ac331cba6 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -22,42 +22,42 @@ module Text.Pandoc.Readers.Docx.Util ( , findAttrByName ) where -import Data.Maybe (mapMaybe) import qualified Data.Text as T import Data.Text (Text) import Text.Pandoc.XML.Light +import qualified Data.Map as M -type NameSpaces = [(Text, Text)] +type NameSpaces = M.Map Text Text elemToNameSpaces :: Element -> NameSpaces -elemToNameSpaces = mapMaybe attrToNSPair . elAttribs - -attrToNSPair :: Attr -> Maybe (Text, Text) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing +elemToNameSpaces = foldr (\(Attr qn val) -> + case qn of + QName s _ (Just "xmlns") -> M.insert s val + _ -> id) mempty . elAttribs elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) + QName name (M.lookup prefix ns) + (if T.null prefix then Nothing else Just prefix) isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = - let ns' = ns ++ elemToNameSpaces element + let ns' = ns <> elemToNameSpaces element in qName (elName element) == name && - qURI (elName element) == lookup prefix ns' + qURI (elName element) == M.lookup prefix ns' findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChild (elemName ns' pref name) el findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = - let ns' = ns ++ elemToNameSpaces el + let ns' = ns <> elemToNameSpaces el in findAttr (elemName ns' pref name) el -- cgit v1.2.3 From 24191a2a278c0dec30bacd66b78cbb8cc8d91324 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 15 Mar 2021 10:37:35 -0700 Subject: Use foldl' instead of foldl everywhere. --- src/Text/Pandoc/App/CommandLineOptions.hs | 4 ++-- src/Text/Pandoc/Citeproc/Locator.hs | 3 ++- src/Text/Pandoc/Class/PandocMonad.hs | 3 ++- src/Text/Pandoc/Extensions.hs | 3 ++- src/Text/Pandoc/Lua/Filter.hs | 3 ++- src/Text/Pandoc/Readers/Docx/Combine.hs | 4 ++-- src/Text/Pandoc/Readers/HTML/TagCategories.hs | 1 + src/Text/Pandoc/Readers/Markdown.hs | 6 +++--- src/Text/Pandoc/Readers/Odt/Arrows/State.hs | 4 ++-- src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs | 4 ++-- src/Text/Pandoc/Readers/Odt/StyleReader.hs | 4 ++-- src/Text/Pandoc/Readers/Textile.hs | 4 ++-- src/Text/Pandoc/Shared.hs | 4 ++-- src/Text/Pandoc/Writers/HTML.hs | 8 ++++---- src/Text/Pandoc/Writers/RST.hs | 4 ++-- src/Text/Pandoc/Writers/Texinfo.hs | 4 ++-- 16 files changed, 34 insertions(+), 29 deletions(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs index b4483f756..a6df12715 100644 --- a/src/Text/Pandoc/App/CommandLineOptions.hs +++ b/src/Text/Pandoc/App/CommandLineOptions.hs @@ -31,7 +31,7 @@ import Data.Aeson.Encode.Pretty (encodePretty', Config(..), keyOrder, defConfig, Indent(..), NumberFormat(..)) import Data.Bifunctor (second) import Data.Char (toLower) -import Data.List (intercalate, sort) +import Data.List (intercalate, sort, foldl') #ifdef _WINDOWS #if MIN_VERSION_base(4,12,0) import Data.List (isPrefixOf) @@ -93,7 +93,7 @@ parseOptionsFromArgs options' defaults prg rawArgs = do ("Try " ++ prg ++ " --help for more information.") -- thread option data structure through all supplied option actions - opts <- foldl (>>=) (return defaults) actions + opts <- foldl' (>>=) (return defaults) actions let mbArgs = case args of [] -> Nothing xs -> Just xs diff --git a/src/Text/Pandoc/Citeproc/Locator.hs b/src/Text/Pandoc/Citeproc/Locator.hs index dba762c02..44416ca12 100644 --- a/src/Text/Pandoc/Citeproc/Locator.hs +++ b/src/Text/Pandoc/Citeproc/Locator.hs @@ -7,6 +7,7 @@ where import Citeproc.Types import Data.Text (Text) import qualified Data.Text as T +import Data.List (foldl') import Text.Parsec import Text.Pandoc.Definition import Text.Pandoc.Parsing (romanNumeral) @@ -139,7 +140,7 @@ pBalancedBraces braces p = try $ do where except = notFollowedBy pBraces >> p -- outer and inner - surround = foldl (\a (open, close) -> sur open close except <|> a) + surround = foldl' (\a (open, close) -> sur open close except <|> a) except braces diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs index 86c8de79e..293a822a0 100644 --- a/src/Text/Pandoc/Class/PandocMonad.hs +++ b/src/Text/Pandoc/Class/PandocMonad.hs @@ -59,6 +59,7 @@ import Control.Monad.Except (MonadError (catchError, throwError), MonadTrans, lift, when) import Data.Digest.Pure.SHA (sha1, showDigest) import Data.Maybe (fromMaybe) +import Data.List (foldl') import Data.Time (UTCTime) import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds, posixSecondsToUTCTime) @@ -612,7 +613,7 @@ checkExistence fn = do -- | Canonicalizes a file path by removing redundant @.@ and @..@. makeCanonical :: FilePath -> FilePath makeCanonical = Posix.joinPath . transformPathParts . splitDirectories - where transformPathParts = reverse . foldl go [] + where transformPathParts = reverse . foldl' go [] go as "." = as go (_:as) ".." = as go as x = x : as diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index 3b96f9e04..266a09e3c 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -34,6 +34,7 @@ module Text.Pandoc.Extensions ( Extension(..) where import Data.Bits (clearBit, setBit, testBit, (.|.)) import Data.Data (Data) +import Data.List (foldl') import qualified Data.Text as T import Data.Typeable (Typeable) import GHC.Generics (Generic) @@ -593,7 +594,7 @@ parseFormatSpec :: T.Text parseFormatSpec = parse formatSpec "" where formatSpec = do name <- formatName - (extsToEnable, extsToDisable) <- foldl (flip ($)) ([],[]) <$> + (extsToEnable, extsToDisable) <- foldl' (flip ($)) ([],[]) <$> many extMod return (T.pack name, reverse extsToEnable, reverse extsToDisable) formatName = many1 $ noneOf "-+" diff --git a/src/Text/Pandoc/Lua/Filter.hs b/src/Text/Pandoc/Lua/Filter.hs index bffe01a34..90967f295 100644 --- a/src/Text/Pandoc/Lua/Filter.hs +++ b/src/Text/Pandoc/Lua/Filter.hs @@ -22,6 +22,7 @@ import Control.Monad.Catch (finally, try) import Data.Data (Data, DataType, dataTypeConstrs, dataTypeName, dataTypeOf, showConstr, toConstr, tyconUQname) import Data.Foldable (foldrM) +import Data.List (foldl') import Data.Map (Map) import Data.Maybe (fromMaybe) import Foreign.Lua (Lua, Peekable, Pushable, StackIndex) @@ -204,7 +205,7 @@ walkMeta lf (Pandoc m bs) = do walkPandoc :: LuaFilter -> Pandoc -> Lua Pandoc walkPandoc (LuaFilter fnMap) = - case foldl mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of + case foldl' mplus Nothing (map (`Map.lookup` fnMap) pandocFilterNames) of Just fn -> \x -> runFilterFunction fn x *> singleElement x Nothing -> return diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index bcf26c4a3..7c6d01769 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -182,7 +182,7 @@ isAttrModifier _ = False smushInlines :: [Inlines] -> Inlines smushInlines xs = combineInlines xs' mempty - where xs' = foldl combineInlines mempty xs + where xs' = foldl' combineInlines mempty xs smushBlocks :: [Blocks] -> Blocks -smushBlocks xs = foldl combineBlocks mempty xs +smushBlocks xs = foldl' combineBlocks mempty xs diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs index b7bd40fee..67aba1cb1 100644 --- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -23,6 +23,7 @@ where import Data.Set (Set, fromList, unions) import Data.Text (Text) +import Data.List (foldl') eitherBlockOrInline :: Set Text eitherBlockOrInline = fromList diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 34edbcc17..dc94fc2d6 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -21,7 +21,7 @@ module Text.Pandoc.Readers.Markdown ( import Control.Monad import Control.Monad.Except (throwError) import Data.Char (isAlphaNum, isPunctuation, isSpace) -import Data.List (transpose, elemIndex, sortOn) +import Data.List (transpose, elemIndex, sortOn, foldl') import qualified Data.Map as M import Data.Maybe import qualified Data.Set as Set @@ -357,7 +357,7 @@ referenceKey = try $ do addKvs <- option [] $ guardEnabled Ext_mmd_link_attributes >> many (try $ spnl >> keyValAttr) blanklines - let attr' = extractIdClass $ foldl (\x f -> f x) attr addKvs + let attr' = extractIdClass $ foldl' (\x f -> f x) attr addKvs target = (escapeURI $ trimr src, tit) st <- getState let oldkeys = stateKeys st @@ -613,7 +613,7 @@ attributes = try $ do spnl attrs <- many (attribute <* spnl) char '}' - return $ foldl (\x f -> f x) nullAttr attrs + return $ foldl' (\x f -> f x) nullAttr attrs attribute :: PandocMonad m => MarkdownParser m (Attr -> Attr) attribute = identifierAttr <|> classAttr <|> keyValAttr <|> specialAttr diff --git a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs index 93c6b5e79..96515bf56 100644 --- a/src/Text/Pandoc/Readers/Odt/Arrows/State.hs +++ b/src/Text/Pandoc/Readers/Odt/Arrows/State.hs @@ -22,7 +22,7 @@ module Text.Pandoc.Readers.Odt.Arrows.State where import Control.Arrow import qualified Control.Category as Cat import Control.Monad - +import Data.List (foldl') import Text.Pandoc.Readers.Odt.Arrows.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -122,7 +122,7 @@ iterateS a = ArrowState $ \(s,f) -> foldr a' (s,mzero) f iterateSL :: (Foldable f, MonadPlus m) => ArrowState s x y -> ArrowState s (f x) (m y) -iterateSL a = ArrowState $ \(s,f) -> foldl a' (s,mzero) f +iterateSL a = ArrowState $ \(s,f) -> foldl' a' (s,mzero) f where a' (s',m) x = second (mplus m.return) $ runArrowState a (s',x) diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index 0d921e23b..341903046 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -64,12 +64,12 @@ import qualified Data.Map as M import Data.Text (Text) import Data.Default import Data.Maybe +import Data.List (foldl') import qualified Text.Pandoc.XML.Light as XML import Text.Pandoc.Readers.Odt.Arrows.State import Text.Pandoc.Readers.Odt.Arrows.Utils - import Text.Pandoc.Readers.Odt.Generic.Namespaces import Text.Pandoc.Readers.Odt.Generic.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible @@ -293,7 +293,7 @@ readNSattributes = fromState $ \state -> maybe (state, failEmpty ) => XMLConverterState nsID extraState -> Maybe (XMLConverterState nsID extraState) extractNSAttrs startState - = foldl (\state d -> state >>= addNS d) + = foldl' (\state d -> state >>= addNS d) (Just startState) nsAttribs where nsAttribs = mapMaybe readNSattr (XML.elAttribs element) diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 5e10f896c..b722aa07d 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -44,7 +44,7 @@ import Control.Arrow import Data.Default import qualified Data.Foldable as F -import Data.List (unfoldr) +import Data.List (unfoldr, foldl') import qualified Data.Map as M import Data.Maybe import Data.Text (Text) @@ -120,7 +120,7 @@ fontPitchReader = executeInSub NsOffice "font-face-decls" ( &&& lookupDefaultingAttr NsStyle "font-pitch" )) - >>?^ ( M.fromList . foldl accumLegalPitches [] ) + >>?^ ( M.fromList . foldl' accumLegalPitches [] ) ) `ifFailedDo` returnV (Right M.empty) where accumLegalPitches ls (Nothing,_) = ls accumLegalPitches ls (Just n,p) = (n,p):ls diff --git a/src/Text/Pandoc/Readers/Textile.hs b/src/Text/Pandoc/Readers/Textile.hs index 860da2dc3..99238c7f0 100644 --- a/src/Text/Pandoc/Readers/Textile.hs +++ b/src/Text/Pandoc/Readers/Textile.hs @@ -38,7 +38,7 @@ module Text.Pandoc.Readers.Textile ( readTextile) where import Control.Monad (guard, liftM) import Control.Monad.Except (throwError) import Data.Char (digitToInt, isUpper) -import Data.List (intersperse, transpose) +import Data.List (intersperse, transpose, foldl') import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup (Tag (..), fromAttrib) @@ -627,7 +627,7 @@ code2 = do -- | Html / CSS attributes attributes :: PandocMonad m => ParserT Text ParserState m Attr -attributes = foldl (flip ($)) ("",[],[]) <$> +attributes = foldl' (flip ($)) ("",[],[]) <$> try (do special <- option id specialAttribute attrs <- many attribute return (special : attrs)) diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index d11ad13f5..0ce9396b3 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -109,7 +109,7 @@ import qualified Data.Bifunctor as Bifunctor import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum, generalCategory, GeneralCategory(NonSpacingMark, SpacingCombiningMark, EnclosingMark, ConnectorPunctuation)) -import Data.List (find, intercalate, intersperse, sortOn) +import Data.List (find, intercalate, intersperse, sortOn, foldl') import qualified Data.Map as M import Data.Maybe (mapMaybe, fromMaybe) import Data.Monoid (Any (..)) @@ -840,7 +840,7 @@ mapLeft = Bifunctor.first -- > collapseFilePath "parent/foo/.." == "parent" -- > collapseFilePath "/parent/foo/../../bar" == "/bar" collapseFilePath :: FilePath -> FilePath -collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories +collapseFilePath = Posix.joinPath . reverse . foldl' go [] . splitDirectories where go rs "." = rs go r@(p:rs) ".." = case p of diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 2f33cd467..332de1545 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -30,7 +30,7 @@ module Text.Pandoc.Writers.HTML ( ) where import Control.Monad.State.Strict import Data.Char (ord) -import Data.List (intercalate, intersperse, partition, delete, (\\)) +import Data.List (intercalate, intersperse, partition, delete, (\\), foldl') import Data.List.NonEmpty (NonEmpty((:|))) import Data.Maybe (fromMaybe, isJust, isNothing) import qualified Data.Set as Set @@ -544,7 +544,7 @@ tagWithAttributes opts html5 selfClosing tagname attr = addAttrs :: PandocMonad m => WriterOptions -> Attr -> Html -> StateT WriterState m Html -addAttrs opts attr h = foldl (!) h <$> attrsToHtml opts attr +addAttrs opts attr h = foldl' (!) h <$> attrsToHtml opts attr toAttrs :: PandocMonad m => [(Text, Text)] -> StateT WriterState m [Attribute] @@ -926,7 +926,7 @@ blockToHtml opts (OrderedList (startnum, numstyle, _) lst) = do numstyle'] else []) l <- ordList opts contents - return $ foldl (!) l attribs + return $ foldl' (!) l attribs blockToHtml opts (DefinitionList lst) = do contents <- mapM (\(term, defs) -> do term' <- liftM H.dt $ inlineListToHtml opts term @@ -1407,7 +1407,7 @@ inlineToHtml opts inline = do Just "audio" -> mediaTag H5.audio "Audio" Just _ -> (H5.embed, []) _ -> imageTag - return $ foldl (!) tag $ attributes ++ specAttrs + return $ foldl' (!) tag $ attributes ++ specAttrs -- note: null title included, as in Markdown.pl (Note contents) -> do notes <- gets stNotes diff --git a/src/Text/Pandoc/Writers/RST.hs b/src/Text/Pandoc/Writers/RST.hs index d01e13db4..54d042332 100644 --- a/src/Text/Pandoc/Writers/RST.hs +++ b/src/Text/Pandoc/Writers/RST.hs @@ -16,7 +16,7 @@ reStructuredText: module Text.Pandoc.Writers.RST ( writeRST, flatten ) where import Control.Monad.State.Strict import Data.Char (isSpace) -import Data.List (transpose, intersperse) +import Data.List (transpose, intersperse, foldl') import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text (Text) @@ -509,7 +509,7 @@ flatten outer | null contents = [outer] | otherwise = combineAll contents where contents = dropInlineParent outer - combineAll = foldl combine [] + combineAll = foldl' combine [] combine :: [Inline] -> Inline -> [Inline] combine f i = diff --git a/src/Text/Pandoc/Writers/Texinfo.hs b/src/Text/Pandoc/Writers/Texinfo.hs index 53da70f84..9d695563f 100644 --- a/src/Text/Pandoc/Writers/Texinfo.hs +++ b/src/Text/Pandoc/Writers/Texinfo.hs @@ -15,7 +15,7 @@ module Text.Pandoc.Writers.Texinfo ( writeTexinfo ) where import Control.Monad.Except (throwError) import Control.Monad.State.Strict import Data.Char (chr, ord) -import Data.List (maximumBy, transpose) +import Data.List (maximumBy, transpose, foldl') import Data.Ord (comparing) import qualified Data.Set as Set import Data.Text (Text) @@ -271,7 +271,7 @@ tableAnyRowToTexinfo :: PandocMonad m -> [[Block]] -> TI m (Doc Text) tableAnyRowToTexinfo itemtype aligns cols = - (literal itemtype $$) . foldl (\row item -> row $$ + (literal itemtype $$) . foldl' (\row item -> row $$ (if isEmpty row then empty else text " @tab ") <> item) empty <$> zipWithM alignedBlock aligns cols alignedBlock :: PandocMonad m -- cgit v1.2.3 From b6a65445e10d74dfe384763c37438338bd395372 Mon Sep 17 00:00:00 2001 From: mbrackeantidot <65160241+mbrackeantidot@users.noreply.github.com> Date: Thu, 29 Apr 2021 18:11:44 +0200 Subject: Docx reader: add handling of vml image objects (jgm#4735) (#7257) They represent images, the same way as other images in vml format. --- src/Text/Pandoc/Readers/Docx/Parse.hs | 11 +++++++++-- test/Tests/Readers/Docx.hs | 4 ++++ test/docx/image_vml_as_object.docx | Bin 0 -> 74199 bytes test/docx/image_vml_as_object.native | 2 ++ 4 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 test/docx/image_vml_as_object.docx create mode 100644 test/docx/image_vml_as_object.native (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index f8ed248d7..7325ff300 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -709,7 +709,8 @@ elemToParPart ns element case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) Nothing -> throwError WrongElem --- The below is an attempt to deal with images in deprecated vml format. +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = @@ -717,9 +718,15 @@ elemToParPart ns element >>= findAttrByName ns "r" "id" in case drawing of - -- Todo: check out title and attr for deprecated format. Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) Nothing -> throwError WrongElem +elemToParPart ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) -- Chart elemToParPart ns element | isElem ns "w" "r" element diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 263e04173..2cce70cc5 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -155,6 +155,10 @@ tests = [ testGroup "document" "VML image" "docx/image_vml.docx" "docx/image_vml.native" + , testCompare + "VML image as object" + "docx/image_vml_as_object.docx" + "docx/image_vml_as_object.native" , testCompare "inline image in links" "docx/inline_images.docx" diff --git a/test/docx/image_vml_as_object.docx b/test/docx/image_vml_as_object.docx new file mode 100644 index 000000000..7e1e4d2ca Binary files /dev/null and b/test/docx/image_vml_as_object.docx differ diff --git a/test/docx/image_vml_as_object.native b/test/docx/image_vml_as_object.native new file mode 100644 index 000000000..6e689486a --- /dev/null +++ b/test/docx/image_vml_as_object.native @@ -0,0 +1,2 @@ +[Para [Str "Test",Space,Str "with",Space,Str "object",Space,Str "as",Space,Str "image:"] +,Para [Image ("",[],[]) [] ("media/image1.emf","")]] -- cgit v1.2.3 From 105a50569be6d2c1b37cc290abcbfbae1cd0fd1e Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Tue, 25 May 2021 17:49:48 +0200 Subject: Allow compilation with base 4.15 --- src/Text/Pandoc/Options.hs | 34 +++++----- src/Text/Pandoc/Readers/Docx/Combine.hs | 6 +- src/Text/Pandoc/Readers/Odt/ContentReader.hs | 11 ++-- src/Text/Pandoc/Writers/OpenDocument.hs | 98 +++++++++++++--------------- 4 files changed, 72 insertions(+), 77 deletions(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 92bda36b2..85d9aa103 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -315,6 +315,23 @@ defaultMathJaxURL = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.j defaultKaTeXURL :: Text defaultKaTeXURL = "https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.11.1/" +-- Update documentation in doc/filters.md if this is changed. +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''TrackChanges) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''WrapOption) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated . drop 8 + } ''TopLevelDivision) + +$(deriveJSON defaultOptions{ constructorTagModifier = + camelCaseStrToHyphenated + } ''ReferenceLocation) + -- Update documentation in doc/filters.md if this is changed. $(deriveJSON defaultOptions ''ReaderOptions) @@ -338,20 +355,3 @@ $(deriveJSON defaultOptions{ constructorTagModifier = } ''ObfuscationMethod) $(deriveJSON defaultOptions ''HTMLSlideVariant) - --- Update documentation in doc/filters.md if this is changed. -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''TrackChanges) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''WrapOption) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated . drop 8 - } ''TopLevelDivision) - -$(deriveJSON defaultOptions{ constructorTagModifier = - camelCaseStrToHyphenated - } ''ReferenceLocation) diff --git a/src/Text/Pandoc/Readers/Docx/Combine.hs b/src/Text/Pandoc/Readers/Docx/Combine.hs index 7c6d01769..6e4faa639 100644 --- a/src/Text/Pandoc/Readers/Docx/Combine.hs +++ b/src/Text/Pandoc/Readers/Docx/Combine.hs @@ -61,7 +61,7 @@ import Data.List import Data.Bifunctor import Data.Sequence ( ViewL (..), ViewR (..), viewl, viewr, spanr, spanl , (><), (|>) ) -import Text.Pandoc.Builder +import Text.Pandoc.Builder as B data Modifier a = Modifier (a -> a) | AttrModifier (Attr -> a -> a) Attr @@ -116,12 +116,12 @@ ilModifierAndInnards ils = case viewl $ unMany ils of inlinesL :: Inlines -> (Inlines, Inlines) inlinesL ils = case viewl $ unMany ils of - (s :< sq) -> (singleton s, Many sq) + (s :< sq) -> (B.singleton s, Many sq) _ -> (mempty, ils) inlinesR :: Inlines -> (Inlines, Inlines) inlinesR ils = case viewr $ unMany ils of - (sq :> s) -> (Many sq, singleton s) + (sq :> s) -> (Many sq, B.singleton s) _ -> (ils, mempty) combineInlines :: Inlines -> Inlines -> Inlines diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index c4220b0db..5520d039f 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -1,4 +1,5 @@ {-# LANGUAGE Arrows #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveFoldable #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE PatternGuards #-} @@ -33,7 +34,7 @@ import Data.List (find) import qualified Data.Map as M import qualified Data.Text as T import Data.Maybe -import Data.Semigroup (First(..), Option(..)) +import Data.Monoid (Alt (..)) import Text.TeXMath (readMathML, writeTeX) import qualified Text.Pandoc.XML.Light as XML @@ -505,13 +506,11 @@ type InlineMatcher = ElementMatcher Inlines type BlockMatcher = ElementMatcher Blocks - -newtype FirstMatch a = FirstMatch (Option (First a)) - deriving (Foldable, Monoid, Semigroup) +newtype FirstMatch a = FirstMatch (Alt Maybe a) + deriving (Foldable, Monoid, Semigroup) firstMatch :: a -> FirstMatch a -firstMatch = FirstMatch . Option . Just . First - +firstMatch = FirstMatch . Alt . Just -- matchingElement :: (Monoid e) diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index 34a3a4aa5..5f3224c2f 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -1,4 +1,5 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} @@ -372,36 +373,32 @@ blocksToOpenDocument o b = vcat <$> mapM (blockToOpenDocument o) b -- | Convert a Pandoc block element to OpenDocument. blockToOpenDocument :: PandocMonad m => WriterOptions -> Block -> OD m (Doc Text) -blockToOpenDocument o bs - | Plain b <- bs = if null b - then return empty - else inParagraphTags =<< inlinesToOpenDocument o b - | Para [Image attr c (s,T.stripPrefix "fig:" -> Just t)] <- bs - = figure attr c s t - | Para b <- bs = if null b && - not (isEnabled Ext_empty_paragraphs o) - then return empty - else inParagraphTags =<< inlinesToOpenDocument o b - | LineBlock b <- bs = blockToOpenDocument o $ linesToPara b - | Div attr xs <- bs = mkDiv attr xs - | Header i (ident,_,_) b - <- bs = setFirstPara >> (inHeaderTags i ident - =<< inlinesToOpenDocument o b) - | BlockQuote b <- bs = setFirstPara >> mkBlockQuote b - | DefinitionList b <- bs = setFirstPara >> defList b - | BulletList b <- bs = setFirstPara >> bulletListToOpenDocument o b - | OrderedList a b <- bs = setFirstPara >> orderedList a b - | CodeBlock _ s <- bs = setFirstPara >> preformatted s - | Table a bc s th tb tf <- bs = setFirstPara >> table (Ann.toTable a bc s th tb tf) - | HorizontalRule <- bs = setFirstPara >> return (selfClosingTag "text:p" - [ ("text:style-name", "Horizontal_20_Line") ]) - | RawBlock f s <- bs = if f == Format "opendocument" - then return $ text $ T.unpack s - else do - report $ BlockNotRendered bs - return empty - | Null <- bs = return empty - | otherwise = return empty +blockToOpenDocument o = \case + Plain b -> if null b + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + Para [Image attr c (s,T.stripPrefix "fig:" -> Just t)] -> figure attr c s t + Para b -> if null b && + not (isEnabled Ext_empty_paragraphs o) + then return empty + else inParagraphTags =<< inlinesToOpenDocument o b + LineBlock b -> blockToOpenDocument o $ linesToPara b + Div attr xs -> mkDiv attr xs + Header i (ident,_,_) b -> do + setFirstPara + inHeaderTags i ident =<< inlinesToOpenDocument o b + BlockQuote b -> setFirstPara >> mkBlockQuote b + DefinitionList b -> setFirstPara >> defList b + BulletList b -> setFirstPara >> bulletListToOpenDocument o b + OrderedList a b -> setFirstPara >> orderedList a b + CodeBlock _ s -> setFirstPara >> preformatted s + Table a bc s th tb tf -> setFirstPara >> table (Ann.toTable a bc s th tb tf) + HorizontalRule -> setFirstPara >> return (selfClosingTag "text:p" + [ ("text:style-name", "Horizontal_20_Line") ]) + b@(RawBlock f s) -> if f == Format "opendocument" + then return $ text $ T.unpack s + else empty <$ report (BlockNotRendered b) + Null -> return empty where defList b = do setInDefinitionList True r <- vcat <$> mapM (deflistItemToOpenDocument o) b @@ -874,27 +871,26 @@ data TextStyle = Italic textStyleAttr :: Map.Map Text Text -> TextStyle -> Map.Map Text Text -textStyleAttr m s - | Italic <- s = Map.insert "fo:font-style" "italic" . - Map.insert "style:font-style-asian" "italic" . - Map.insert "style:font-style-complex" "italic" $ m - | Bold <- s = Map.insert "fo:font-weight" "bold" . - Map.insert "style:font-weight-asian" "bold" . - Map.insert "style:font-weight-complex" "bold" $ m - | Under <- s = Map.insert "style:text-underline-style" "solid" . - Map.insert "style:text-underline-width" "auto" . - Map.insert "style:text-underline-color" "font-color" $ m - | Strike <- s = Map.insert "style:text-line-through-style" "solid" m - | Sub <- s = Map.insert "style:text-position" "sub 58%" m - | Sup <- s = Map.insert "style:text-position" "super 58%" m - | SmallC <- s = Map.insert "fo:font-variant" "small-caps" m - | Pre <- s = Map.insert "style:font-name" "Courier New" . - Map.insert "style:font-name-asian" "Courier New" . - Map.insert "style:font-name-complex" "Courier New" $ m - | Language lang <- s - = Map.insert "fo:language" (langLanguage lang) . - maybe id (Map.insert "fo:country") (langRegion lang) $ m - | otherwise = m +textStyleAttr m = \case + Italic -> Map.insert "fo:font-style" "italic" . + Map.insert "style:font-style-asian" "italic" . + Map.insert "style:font-style-complex" "italic" $ m + Bold -> Map.insert "fo:font-weight" "bold" . + Map.insert "style:font-weight-asian" "bold" . + Map.insert "style:font-weight-complex" "bold" $ m + Under -> Map.insert "style:text-underline-style" "solid" . + Map.insert "style:text-underline-width" "auto" . + Map.insert "style:text-underline-color" "font-color" $ m + Strike -> Map.insert "style:text-line-through-style" "solid" m + Sub -> Map.insert "style:text-position" "sub 58%" m + Sup -> Map.insert "style:text-position" "super 58%" m + SmallC -> Map.insert "fo:font-variant" "small-caps" m + Pre -> Map.insert "style:font-name" "Courier New" . + Map.insert "style:font-name-asian" "Courier New" . + Map.insert "style:font-name-complex" "Courier New" $ m + Language lang -> + Map.insert "fo:language" (langLanguage lang) . + maybe id (Map.insert "fo:country") (langRegion lang) $ m withLangFromAttr :: PandocMonad m => Attr -> OD m a -> OD m a withLangFromAttr (_,_,kvs) action = -- cgit v1.2.3 From 44484d0dee1bd095240b9faf26f8d1dad8e560ea Mon Sep 17 00:00:00 2001 From: Emily Bourke Date: Sun, 11 Apr 2021 21:42:53 +0100 Subject: Docx reader: Read table column widths. --- src/Text/Pandoc/Readers/Docx.hs | 5 +- src/Text/Pandoc/Readers/Docx/Parse.hs | 2 +- test/Tests/Writers/Docx.hs | 5 ++ test/docx/0_level_headers.native | 4 +- test/docx/golden/table_one_row.docx | Bin 9840 -> 9840 bytes test/docx/golden/table_with_list_cell.docx | Bin 10159 -> 10162 bytes test/docx/golden/tables-default-widths.docx | Bin 0 -> 10200 bytes test/docx/golden/tables.docx | Bin 10200 -> 10202 bytes test/docx/sdt_elements.native | 6 +- test/docx/table_one_row.native | 8 +-- test/docx/table_variable_width.native | 12 ++-- test/docx/table_with_list_cell.native | 6 +- test/docx/tables-default-widths.native | 92 ++++++++++++++++++++++++++++ test/docx/tables.native | 18 +++--- 14 files changed, 128 insertions(+), 30 deletions(-) create mode 100644 test/docx/golden/tables-default-widths.docx create mode 100644 test/docx/tables-default-widths.native (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 22dd54193..375bb7338 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -639,7 +639,7 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = bodyPartToBlocks $ Paragraph pPr' parparts bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do +bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do let cap' = simpleCaption $ plain $ text cap (hdr, rows) = case firstRowFormatting look of True | null rs -> (Nothing, [r]) @@ -669,7 +669,8 @@ bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. let alignments = replicate width AlignDefault - widths = replicate width ColWidthDefault + totalWidth = sum grid + widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 7325ff300..978d6ff3a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -563,7 +563,7 @@ elemToTblGrid :: NameSpaces -> Element -> D TblGrid elemToTblGrid ns element | isElem ns "w" "tblGrid" element = let cols = findChildrenByName ns "w" "gridCol" element in - mapD (\e -> maybeToD (findAttrByName ns "w" "val" e >>= stringToInteger)) + mapD (\e -> maybeToD (findAttrByName ns "w" "w" e >>= stringToInteger)) cols elemToTblGrid _ _ = throwError WrongElem diff --git a/test/Tests/Writers/Docx.hs b/test/Tests/Writers/Docx.hs index 2e0f1e3fb..da25b95e0 100644 --- a/test/Tests/Writers/Docx.hs +++ b/test/Tests/Writers/Docx.hs @@ -111,6 +111,11 @@ tests = [ testGroup "inlines" def "docx/tables.native" "docx/golden/tables.docx" + , docxTest + "tables without explicit column widths" + def + "docx/tables-default-widths.native" + "docx/golden/tables-default-widths.docx" , docxTest "tables with lists in cells" def diff --git a/test/docx/0_level_headers.native b/test/docx/0_level_headers.native index 7f875891e..ed589b029 100644 --- a/test/docx/0_level_headers.native +++ b/test/docx/0_level_headers.native @@ -1,6 +1,6 @@ [Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 1.0)] (TableHead ("",[],[]) []) [(TableBody ("",[],[]) (RowHeadColumns 0) @@ -49,4 +49,4 @@ ,Para [Strong [Str "Table",Space,Str "Page"]] ,Para [Strong [Str "No",Space,Str "table",Space,Str "of",Space,Str "figures",Space,Str "entries",Space,Str "found."]] ,Header 1 ("introduction",[],[]) [Str "Introduction"] -,Para [Str "Nothing",Space,Str "to",Space,Str "introduce,",Space,Str "yet."]] \ No newline at end of file +,Para [Str "Nothing",Space,Str "to",Space,Str "introduce,",Space,Str "yet."]] diff --git a/test/docx/golden/table_one_row.docx b/test/docx/golden/table_one_row.docx index f75e567ab..a7a8f2519 100644 Binary files a/test/docx/golden/table_one_row.docx and b/test/docx/golden/table_one_row.docx differ diff --git a/test/docx/golden/table_with_list_cell.docx b/test/docx/golden/table_with_list_cell.docx index a49f70643..1362d4609 100644 Binary files a/test/docx/golden/table_with_list_cell.docx and b/test/docx/golden/table_with_list_cell.docx differ diff --git a/test/docx/golden/tables-default-widths.docx b/test/docx/golden/tables-default-widths.docx new file mode 100644 index 000000000..f24e27516 Binary files /dev/null and b/test/docx/golden/tables-default-widths.docx differ diff --git a/test/docx/golden/tables.docx b/test/docx/golden/tables.docx index f24e27516..9dcbbc9d0 100644 Binary files a/test/docx/golden/tables.docx and b/test/docx/golden/tables.docx differ diff --git a/test/docx/sdt_elements.native b/test/docx/sdt_elements.native index dca82f0a0..a072c0d39 100644 --- a/test/docx/sdt_elements.native +++ b/test/docx/sdt_elements.native @@ -1,8 +1,8 @@ [Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.22069570301081556) + ,(AlignDefault,ColWidth 0.22069570301081556) + ,(AlignDefault,ColWidth 0.5586085939783689)] (TableHead ("",[],[]) []) [(TableBody ("",[],[]) (RowHeadColumns 0) diff --git a/test/docx/table_one_row.native b/test/docx/table_one_row.native index e9188b145..88d5e3af5 100644 --- a/test/docx/table_one_row.native +++ b/test/docx/table_one_row.native @@ -1,8 +1,8 @@ [Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.3333333333333333) + ,(AlignDefault,ColWidth 0.3333333333333333) + ,(AlignDefault,ColWidth 0.3333333333333333)] (TableHead ("",[],[]) []) [(TableBody ("",[],[]) (RowHeadColumns 0) @@ -15,4 +15,4 @@ ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "Table"]]]])] (TableFoot ("",[],[]) - [])] \ No newline at end of file + [])] diff --git a/test/docx/table_variable_width.native b/test/docx/table_variable_width.native index 229cb83b1..43ac40cca 100644 --- a/test/docx/table_variable_width.native +++ b/test/docx/table_variable_width.native @@ -1,10 +1,10 @@ [Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 2.0096205237840725e-2) + ,(AlignDefault,ColWidth 1.9882415820416888e-2) + ,(AlignDefault,ColWidth 0.22202030999465527) + ,(AlignDefault,ColWidth 0.4761090326028862) + ,(AlignDefault,ColWidth 1.0689470871191876e-4)] (TableHead ("",[],[]) [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) @@ -42,4 +42,4 @@ ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) []]])] (TableFoot ("",[],[]) - [])] \ No newline at end of file + [])] diff --git a/test/docx/table_with_list_cell.native b/test/docx/table_with_list_cell.native index 06d8606da..51a35184b 100644 --- a/test/docx/table_with_list_cell.native +++ b/test/docx/table_with_list_cell.native @@ -1,7 +1,7 @@ [Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.5) + ,(AlignDefault,ColWidth 0.5)] (TableHead ("",[],[]) [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) @@ -22,4 +22,4 @@ ,[Para [Str "A"]] ,[Para [Str "Numbered",Space,Str "list."]]]]]])] (TableFoot ("",[],[]) - [])] \ No newline at end of file + [])] diff --git a/test/docx/tables-default-widths.native b/test/docx/tables-default-widths.native new file mode 100644 index 000000000..e541e5a6e --- /dev/null +++ b/test/docx/tables-default-widths.native @@ -0,0 +1,92 @@ +[Header 2 ("a-table-with-and-without-a-header-row",[],[]) [Str "A",Space,Str "table,",Space,Str "with",Space,Str "and",Space,Str "without",Space,Str "a",Space,Str "header",Space,Str "row"] +,Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault)] + (TableHead ("",[],[]) + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Name"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Game"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Fame"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Blame"]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Lebron",Space,Str "James"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Basketball"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Very",Space,Str "High"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Leaving",Space,Str "Cleveland"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Ryan",Space,Str "Braun"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Baseball"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Moderate"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Steroids"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Russell",Space,Str "Wilson"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Football"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "High"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Tacky",Space,Str "uniform"]]]])] + (TableFoot ("",[],[]) + []) +,Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault)] + (TableHead ("",[],[]) + []) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Sinple"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Table"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Without"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Header"]]]])] + (TableFoot ("",[],[]) + []) +,Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidthDefault) + ,(AlignDefault,ColWidthDefault)] + (TableHead ("",[],[]) + []) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "Simple"] + ,Para [Str "Multiparagraph"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "Table"] + ,Para [Str "Full"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "Of"] + ,Para [Str "Paragraphs"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Para [Str "In",Space,Str "each"] + ,Para [Str "Cell."]]]])] + (TableFoot ("",[],[]) + [])] \ No newline at end of file diff --git a/test/docx/tables.native b/test/docx/tables.native index e541e5a6e..5a89496be 100644 --- a/test/docx/tables.native +++ b/test/docx/tables.native @@ -1,10 +1,10 @@ [Header 2 ("a-table-with-and-without-a-header-row",[],[]) [Str "A",Space,Str "table,",Space,Str "with",Space,Str "and",Space,Str "without",Space,Str "a",Space,Str "header",Space,Str "row"] ,Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.25) + ,(AlignDefault,ColWidth 0.25) + ,(AlignDefault,ColWidth 0.25) + ,(AlignDefault,ColWidth 0.25)] (TableHead ("",[],[]) [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) @@ -48,8 +48,8 @@ []) ,Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.5) + ,(AlignDefault,ColWidth 0.5)] (TableHead ("",[],[]) []) [(TableBody ("",[],[]) (RowHeadColumns 0) @@ -68,8 +68,8 @@ []) ,Table ("",[],[]) (Caption Nothing []) - [(AlignDefault,ColWidthDefault) - ,(AlignDefault,ColWidthDefault)] + [(AlignDefault,ColWidth 0.5) + ,(AlignDefault,ColWidth 0.5)] (TableHead ("",[],[]) []) [(TableBody ("",[],[]) (RowHeadColumns 0) @@ -89,4 +89,4 @@ [Para [Str "In",Space,Str "each"] ,Para [Str "Cell."]]]])] (TableFoot ("",[],[]) - [])] \ No newline at end of file + [])] -- cgit v1.2.3 From 56b211120c62a01f8aba1c4512acfe4677d8c7d0 Mon Sep 17 00:00:00 2001 From: Emily Bourke Date: Thu, 18 Jun 2020 09:53:32 +0100 Subject: Docx reader: Support new table features. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Column spans * Row spans - The spec says that if the `val` attribute is ommitted, its value should be assumed to be `continue`, and that its values are restricted to {`restart`, `continue`}. If the value has any other value, I think it seems reasonable to default it to `continue`. It might cause problems if the spec is extended in the future by adding a third possible value, in which case this would probably give incorrect behaviour, and wouldn't error. * Allow multiple header rows * Include table description in simple caption - The table description element is like alt text for a table (along with the table caption element). It seems like we should include this somewhere, but I’m not 100% sure how – I’m pairing it with the simple caption for the moment. (Should it maybe go in the block caption instead?) * Detect table captions - Check for caption paragraph style /and/ either the simple or complex table field. This means the caption detection fails for captions which don’t contain a field, as in an example doc I added as a test. However, I think it’s better to be too conservative: a missed table caption will still show up as a paragraph next to the table, whereas if I incorrectly classify something else as a table caption it could cause havoc by pairing it up with a table it’s not at all related to, or dropping it entirely. * Update tests and add new ones Partially fixes: #6316 --- src/Text/Pandoc/Readers/Docx.hs | 92 ++++++++------ src/Text/Pandoc/Readers/Docx/Parse.hs | 113 +++++++++++++++-- src/Text/Pandoc/Readers/Docx/Util.hs | 7 ++ test/Tests/Readers/Docx.hs | 16 +++ test/docx/sdt_elements.native | 13 +- test/docx/table_captions_no_field.docx | Bin 0 -> 40482 bytes test/docx/table_captions_no_field.native | 34 ++++++ test/docx/table_captions_with_field.docx | Bin 0 -> 41091 bytes test/docx/table_captions_with_field.native | 54 +++++++++ test/docx/table_header_rowspan.docx | Bin 0 -> 15826 bytes test/docx/table_header_rowspan.native | 189 +++++++++++++++++++++++++++++ test/docx/table_one_header_row.docx | Bin 0 -> 12185 bytes test/docx/table_one_header_row.native | 18 +++ test/docx/table_one_row.docx | Bin 25251 -> 12148 bytes test/docx/table_variable_width.native | 19 ++- 15 files changed, 487 insertions(+), 68 deletions(-) create mode 100644 test/docx/table_captions_no_field.docx create mode 100644 test/docx/table_captions_no_field.native create mode 100644 test/docx/table_captions_with_field.docx create mode 100644 test/docx/table_captions_with_field.native create mode 100644 test/docx/table_header_rowspan.docx create mode 100644 test/docx/table_header_rowspan.native create mode 100644 test/docx/table_one_header_row.docx create mode 100644 test/docx/table_one_header_row.native (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 375bb7338..c06adf7e3 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx import Codec.Archive.Zip import Control.Monad.Reader import Control.Monad.State.Strict +import Data.Bifunctor (bimap, first) import qualified Data.ByteString.Lazy as B import Data.Default (Default) -import Data.List (delete, intersect) +import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (catMaybes, isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -113,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines + , docxTableCaptions :: [Blocks] } instance Default DState where @@ -123,6 +125,7 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty + , docxTableCaptions = [] } data DEnv = DEnv { docxOptions :: ReaderOptions @@ -491,15 +494,32 @@ singleParaToPlain blks singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks -cellToBlocks (Docx.Cell bps) = do +cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell +cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps - return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks') + +rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row] +rowsToRows rows = do + let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows) + cells <- traverse (traverse (uncurry cellToCell)) rowspans + return (fmap (Pandoc.Row nullAttr) cells) + +splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row]) +splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst + $ if hasFirstRowFormatting + then foldl' f ((take 1 rs, []), True) (drop 1 rs) + else foldl' f (([], []), False) rs + where + f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs) + | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs) + = ((r : headerRows, bodyRows), True) + | otherwise + = ((headerRows, r : bodyRows), False) + + isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue -rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks] -rowToBlocksList (Docx.Row cells) = do - blksList <- mapM cellToBlocks cells - return $ map singleParaToPlain blksList -- like trimInlines, but also take out linebreaks trimSps :: Inlines -> Inlines @@ -546,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c +bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) +bodyPartToTableCaption (TblCaption pPr parparts) = + Just <$> bodyPartToBlocks (Paragraph pPr parparts) +bodyPartToTableCaption _ = pure Nothing + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -637,50 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (TblCaption _ _) = + return $ para mempty -- collected separately bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do - let cap' = simpleCaption $ plain $ text cap - (hdr, rows) = case firstRowFormatting look of - True | null rs -> (Nothing, [r]) - | otherwise -> (Just r, rs) - False -> (Nothing, r:rs) - - cells <- mapM rowToBlocksList rows +bodyPartToBlocks (Tbl cap grid look parts) = do + captions <- gets docxTableCaptions + fullCaption <- case captions of + c : cs -> do + modify (\s -> s { docxTableCaptions = cs }) + return c + [] -> return $ if T.null cap then mempty else plain (text cap) + let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) + cap' = caption shortCaption fullCaption + (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts let width = maybe 0 maximum $ nonEmpty $ map rowLength parts rowLength :: Docx.Row -> Int - rowLength (Docx.Row c) = length c + rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c) - let toRow = Pandoc.Row nullAttr . map simpleCell - toHeaderRow l = [toRow l | not (null l)] + headerCells <- rowsToRows hdr + bodyCells <- rowsToRows rows - -- pad cells. New Text.Pandoc.Builder will do that for us, - -- so this is for compatibility while we switch over. - let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells - - hdrCells <- case hdr of - Just r' -> toHeaderRow <$> rowToBlocksList r' - Nothing -> return [] - - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. + -- Horizontal column alignment goes to the default at the moment. Getting + -- it might be difficult, since there doesn't seem to be a column entity + -- in docx. let alignments = replicate width AlignDefault totalWidth = sum grid widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) - (TableHead nullAttr hdrCells) - [TableBody nullAttr 0 [] cells'] + (TableHead nullAttr headerCells) + [TableBody nullAttr 0 [] bodyCells] (TableFoot nullAttr []) bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) - -- replace targets with generated anchors. rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do @@ -716,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps + captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps + modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 978d6ff3a..aaa8f4ad0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParStyle , CharStyle(cStyleData) , Row(..) + , TblHeader(..) , Cell(..) + , VMerge(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , pHeading , constructBogusParStyleData , leftBiasedMergeRunStyle + , rowsToRowspans ) where import Text.Pandoc.Readers.Docx.Parse.Styles import Codec.Archive.Zip @@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] + | TblCaption ParagraphStyle [ParPart] | OMathPara [Exp] deriving Show @@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool} defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -newtype Row = Row [Cell] - deriving Show +data Row = Row TblHeader [Cell] deriving Show + +data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq) -newtype Cell = Cell [BodyPart] +data Cell = Cell GridSpan VMerge [BodyPart] deriving Show +type GridSpan = Integer + +data VMerge = Continue + -- ^ This cell should be merged with the one above it + | Restart + -- ^ This cell should not be merged with the one above it + deriving (Show, Eq) + +rowsToRowspans :: [Row] -> [[(Int, Cell)]] +rowsToRowspans rows = let + removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart)) + in removeMergedCells (foldr f [] rows) + where + f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]] + f (Row _ cells) acc = let + spans = g cells Nothing (listToMaybe acc) + in spans : acc + + g :: + -- | The current row + [Cell] -> + -- | Number of columns left below + Maybe Integer -> + -- | (rowspan so far, cell) for the row below this one + Maybe [(Int, Cell)] -> + -- | (rowspan so far, cell) for this row + [(Int, Cell)] + g cells _ Nothing = zip (repeat 1) cells + g cells columnsLeftBelow (Just rowBelow) = + case cells of + [] -> [] + thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of + [] -> zip (repeat 1) cells + (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ -> + let spanSoFar = case vmerge of + Restart -> 1 + Continue -> 1 + spanSoFarBelow + columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow) + (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow + in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow) + + dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)]) + dropColumns n [] = (n, []) + dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) = + if n < gridSpan + then (gridSpan - n, cells) + else dropColumns (n - gridSpan) otherCells + leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle leftBiasedMergeRunStyle a b = RunStyle { isBold = isBold a <|> isBold b @@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element = do let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems - return $ Row cells + let hasTblHeader = maybe NoTblHeader (const HasTblHeader) + (findChildByName ns "w" "trPr" element + >>= findChildByName ns "w" "tblHeader") + return $ Row hasTblHeader cells elemToRow _ _ = throwError WrongElem elemToCell :: NameSpaces -> Element -> D Cell elemToCell ns element | isElem ns "w" "tc" element = do + let properties = findChildByName ns "w" "tcPr" element + let gridSpan = properties + >>= findChildByName ns "w" "gridSpan" + >>= findAttrByName ns "w" "val" + >>= stringToInteger + let vMerge = case properties >>= findChildByName ns "w" "vMerge" of + Nothing -> Restart + Just e -> + fromMaybe Continue $ do + s <- findAttrByName ns "w" "val" e + case s of + "continue" -> Just Continue + "restart" -> Just Restart + _ -> Nothing cellContents <- mapD (elemToBodyPart ns) (elChildren element) - return $ Cell cellContents + return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation @@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildrenByName ns "m" "oMathPara" element = - do - expsLst <- eitherToD $ readOMML $ showElement c - return $ OMathPara expsLst + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do @@ -647,13 +716,31 @@ elemToBodyPart ns element Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts - _ -> return $ Paragraph parstyle parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts + elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChildByName ns "w" "tblPr" element + let tblProperties = findChildByName ns "w" "tblPr" element + caption = fromMaybe "" $ tblProperties >>= findChildByName ns "w" "tblCaption" >>= findAttrByName ns "w" "val" - caption = fromMaybe "" caption' + description = fromMaybe "" $ tblProperties + >>= findChildByName ns "w" "tblDescription" + >>= findAttrByName ns "w" "val" grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] @@ -666,7 +753,7 @@ elemToBodyPart ns element grid <- grid' tblLook <- tblLook' rows <- mapD (elemToRow ns) (elChildren element) - return $ Tbl caption grid tblLook rows + return $ Tbl (caption <> description) grid tblLook rows elemToBodyPart _ _ = throwError WrongElem lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index ac331cba6..970697a2d 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName + , findElementByName , findAttrByName ) where @@ -56,6 +57,12 @@ findChildrenByName ns pref name el = let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el +-- | Like 'findChildrenByName', but searches descendants. +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el + findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns <> elemToNameSpaces el diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 939ff9939..220c7d9c5 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -317,14 +317,30 @@ tests = [ testGroup "document" "tables with lists in cells" "docx/table_with_list_cell.docx" "docx/table_with_list_cell.native" + , testCompare + "a table with a header which contains rowspans greater than 1" + "docx/table_header_rowspan.docx" + "docx/table_header_rowspan.native" , testCompare "tables with one row" "docx/table_one_row.docx" "docx/table_one_row.native" + , testCompare + "tables with just one row, which is a header" + "docx/table_one_header_row.docx" + "docx/table_one_header_row.native" , testCompare "tables with variable width" "docx/table_variable_width.docx" "docx/table_variable_width.native" + , testCompare + "tables with captions which contain a Table field" + "docx/table_captions_with_field.docx" + "docx/table_captions_with_field.native" + , testCompare + "tables with captions which don't contain a Table field" + "docx/table_captions_no_field.docx" + "docx/table_captions_no_field.native" , testCompare "code block" "docx/codeblock.docx" diff --git a/test/docx/sdt_elements.native b/test/docx/sdt_elements.native index a072c0d39..d2aa00994 100644 --- a/test/docx/sdt_elements.native +++ b/test/docx/sdt_elements.native @@ -4,17 +4,16 @@ ,(AlignDefault,ColWidth 0.22069570301081556) ,(AlignDefault,ColWidth 0.5586085939783689)] (TableHead ("",[],[]) - []) - [(TableBody ("",[],[]) (RowHeadColumns 0) - [] - [Row ("",[],[]) + [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Strong [Str "col1Header"]]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Strong [Str "col2Header"]]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) - [Plain [Strong [Str "col3Header"]]]] - ,Row ("",[],[]) + [Plain [Strong [Str "col3Header"]]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "col1",Space,Str "content"]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) @@ -22,4 +21,4 @@ ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "col3",Space,Str "content"]]]])] (TableFoot ("",[],[]) - [])] \ No newline at end of file + [])] diff --git a/test/docx/table_captions_no_field.docx b/test/docx/table_captions_no_field.docx new file mode 100644 index 000000000..1687d32a2 Binary files /dev/null and b/test/docx/table_captions_no_field.docx differ diff --git a/test/docx/table_captions_no_field.native b/test/docx/table_captions_no_field.native new file mode 100644 index 000000000..b8f54d541 --- /dev/null +++ b/test/docx/table_captions_no_field.native @@ -0,0 +1,34 @@ +[Para [Str "See",Space,Str "Table",Space,Str "5.1."] +,Para [Str "Table",Space,Str "5.1"] +,Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidth 0.7605739372523825) + ,(AlignDefault,ColWidth 0.11971303137380876) + ,(AlignDefault,ColWidth 0.11971303137380876)] + (TableHead ("",[],[]) +[Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Count"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "%"]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "First",Space,Str "option"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "242"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "45"]]] +,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Second",Space,Str "option"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "99"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "18"]]]])] + (TableFoot ("",[],[]) + []) +,Header 2 ("section", [], []) []] diff --git a/test/docx/table_captions_with_field.docx b/test/docx/table_captions_with_field.docx new file mode 100644 index 000000000..db6de3088 Binary files /dev/null and b/test/docx/table_captions_with_field.docx differ diff --git a/test/docx/table_captions_with_field.native b/test/docx/table_captions_with_field.native new file mode 100644 index 000000000..deb8afc6b --- /dev/null +++ b/test/docx/table_captions_with_field.native @@ -0,0 +1,54 @@ +[Para [Str "See",Space,Str "Table",Space,Str "1."] +,Para [] +,Table ("",[],[]) (Caption Nothing + [Para [Str "Table",Space,Str "1"]]) + [(AlignDefault,ColWidth 0.7605739372523825) + ,(AlignDefault,ColWidth 0.11971303137380876) + ,(AlignDefault,ColWidth 0.11971303137380876)] + (TableHead ("",[],[]) +[Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Count"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "%"]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "First",Space,Str "option"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "242"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "45"]]] +,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Second",Space,Str "option"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "99"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "18"]]]])] + (TableFoot ("",[],[]) + []) +,Header 2 ("section", [], []) [] +,Table ("",[],[]) (Caption Nothing + [Para [Str "Table",Space,Str "2"]]) + [(AlignDefault,ColWidth 0.3332963620230701) + ,(AlignDefault,ColWidth 0.3332963620230701) + ,(AlignDefault,ColWidth 0.3334072759538598)] + (TableHead ("",[],[]) + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "One"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Two"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Three"]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [])] + (TableFoot ("",[],[]) + []) +,Para [] +,Para [Str "See",Space,Str "Table",Space,Str "2."]] diff --git a/test/docx/table_header_rowspan.docx b/test/docx/table_header_rowspan.docx new file mode 100644 index 000000000..1cc32a105 Binary files /dev/null and b/test/docx/table_header_rowspan.docx differ diff --git a/test/docx/table_header_rowspan.native b/test/docx/table_header_rowspan.native new file mode 100644 index 000000000..d951f29e4 --- /dev/null +++ b/test/docx/table_header_rowspan.native @@ -0,0 +1,189 @@ +[Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidth 0.30701754385964913) + ,(AlignDefault,ColWidth 0.1364522417153996) + ,(AlignDefault,ColWidth 0.10009746588693957) + ,(AlignDefault,ColWidth 9.707602339181287e-2) + ,(AlignDefault,ColWidth 7.719298245614035e-2) + ,(AlignDefault,ColWidth 7.085769980506823e-2) + ,(AlignDefault,ColWidth 7.09551656920078e-2) + ,(AlignDefault,ColWidth 0.14035087719298245)] + (TableHead ("",[],[]) +[Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 2) (ColSpan 1) + [Plain [Str "A"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 2) (ColSpan 1) + [Plain [Strong [Str "B"]]] + ,Cell ("",[],[]) AlignDefault (RowSpan 2) (ColSpan 1) + [Plain [Strong [Str "C"]]] + ,Cell ("",[],[]) AlignDefault (RowSpan 2) (ColSpan 1) + [Plain [Strong [Str "D"]]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 3) + [Plain [Str "E"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 2) (ColSpan 1) + [Plain [Str "F"]]] +,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Strong [Str "G"]]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Strong [Str "H"]]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Strong [Str "I"]]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ,Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "1"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "2"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "3"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "4"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "5"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "6"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "7"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "8"]]] + ])] + (TableFoot ("",[],[]) + [])] diff --git a/test/docx/table_one_header_row.docx b/test/docx/table_one_header_row.docx new file mode 100644 index 000000000..db715dda8 Binary files /dev/null and b/test/docx/table_one_header_row.docx differ diff --git a/test/docx/table_one_header_row.native b/test/docx/table_one_header_row.native new file mode 100644 index 000000000..4aae830ac --- /dev/null +++ b/test/docx/table_one_header_row.native @@ -0,0 +1,18 @@ +[Table ("",[],[]) (Caption Nothing + []) + [(AlignDefault,ColWidth 0.33302433371958284) + ,(AlignDefault,ColWidth 0.3332560834298957) + ,(AlignDefault,ColWidth 0.33371958285052145)] + (TableHead ("",[],[]) + [Row ("",[],[]) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "One"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Row"]] + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Plain [Str "Table"]]]]) + [(TableBody ("",[],[]) (RowHeadColumns 0) + [] + [])] + (TableFoot ("",[],[]) + [])] diff --git a/test/docx/table_one_row.docx b/test/docx/table_one_row.docx index f7e0ebe43..d05a856b5 100644 Binary files a/test/docx/table_one_row.docx and b/test/docx/table_one_row.docx differ diff --git a/test/docx/table_variable_width.native b/test/docx/table_variable_width.native index 43ac40cca..ff1cc0dc4 100644 --- a/test/docx/table_variable_width.native +++ b/test/docx/table_variable_width.native @@ -4,7 +4,8 @@ ,(AlignDefault,ColWidth 1.9882415820416888e-2) ,(AlignDefault,ColWidth 0.22202030999465527) ,(AlignDefault,ColWidth 0.4761090326028862) - ,(AlignDefault,ColWidth 1.0689470871191876e-4)] + ,(AlignDefault,ColWidth 1.0689470871191876e-4) + ,(AlignDefault,ColWidth 0.26178514163548905)] (TableHead ("",[],[]) [Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) @@ -13,33 +14,27 @@ [] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "h3"]] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 2) [Plain [Str "h4"]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "h5"]]]]) [(TableBody ("",[],[]) (RowHeadColumns 0) [] [Row ("",[],[]) - [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 3) [Plain [Str "c11"]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) - [] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) - [] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 2) []] ,Row ("",[],[]) [Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 2) [Plain [Str "c22"]] ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "c23"]] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) - [] - ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 1) + ,Cell ("",[],[]) AlignDefault (RowSpan 1) (ColSpan 2) []]])] (TableFoot ("",[],[]) [])] -- cgit v1.2.3 From cfa26e3ca0346397f41af9aed5b4cd1d86be1220 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 12 Jun 2021 13:56:09 -0700 Subject: Docx reader: handle absolute URIs in Relationship Target. Closes #7374. --- src/Text/Pandoc/Readers/Docx/Parse.hs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'src/Text/Pandoc/Readers/Docx') diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index aaa8f4ad0..dbb16a821 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -479,20 +479,26 @@ filePathToRelType path docXmlPath = then Just InDocument else Nothing -relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship -relElemToRelationship relType element | qName (elName element) == "Relationship" = +relElemToRelationship :: FilePath -> DocumentLocation -> Element + -> Maybe Relationship +relElemToRelationship fp relType element | qName (elName element) == "Relationship" = do relId <- findAttr (QName "Id" Nothing Nothing) element target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship relType relId target -relElemToRelationship _ _ = Nothing + -- target may be relative (media/image1.jpeg) or absolute + -- (/word/media/image1.jpeg); we need to relativize it (see #7374) + let frontOfFp = T.pack $ takeWhile (/= '_') fp + let target' = fromMaybe target $ + T.stripPrefix frontOfFp $ T.dropWhile (== '/') target + return $ Relationship relType relId target' +relElemToRelationship _ _ _ = Nothing filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship] filePathToRelationships ar docXmlPath fp | Just relType <- filePathToRelType fp docXmlPath , Just entry <- findEntryByPath fp ar , Just relElems <- parseXMLFromEntry entry = - mapMaybe (relElemToRelationship relType) $ elChildren relElems + mapMaybe (relElemToRelationship fp relType) $ elChildren relElems filePathToRelationships _ _ _ = [] archiveToRelationships :: Archive -> FilePath -> [Relationship] -- cgit v1.2.3