From 8c38390038edcebd55f9dec8359ef983f3813425 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Fri, 27 Nov 2020 21:21:25 +0100 Subject: HTML reader tests: improve test coverage of new features --- src/Text/Pandoc/Readers/HTML/Table.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 5a783988f..91639fa4c 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -214,7 +214,8 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) normalize widths head' bodies foot = do let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot - let rowLength = length . rowCells + let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs + let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells let ncols = maximum (map rowLength rows) let tblType = tableType (map rowCells rows) -- fail on empty table -- cgit v1.2.3 From 83d63b72e1b9eff9f2aa3b9f36b56d348f0909a2 Mon Sep 17 00:00:00 2001 From: Tassos Manganaris Date: Fri, 27 Nov 2020 23:42:53 -0500 Subject: Fix a tiny Typo in the CSV reader module Header comment in the CSV reader module says "RST" instead of "CSV". --- src/Text/Pandoc/Readers/CSV.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs index f0edcaa16..45f4d88d4 100644 --- a/src/Text/Pandoc/Readers/CSV.hs +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | - Module : Text.Pandoc.Readers.RST + Module : Text.Pandoc.Readers.CSV Copyright : Copyright (C) 2006-2020 John MacFarlane License : GNU GPL, version 2 or above -- cgit v1.2.3 From bff9c129c3579f928a0067759d0a784eb5c07d30 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 29 Nov 2020 10:29:51 -0800 Subject: LaTeX reader: don't parse `\rule` with width 0 as horizontal rule. --- src/Text/Pandoc/Readers/LaTeX.hs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 2a9bff746..15a1a19fc 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1225,6 +1225,16 @@ preamble = mconcat <$> many preambleBlock anyTok return mempty) +rule :: PandocMonad m => LP m Blocks +rule = do + skipopts + width <- T.takeWhile (\c -> isDigit c || c == '.') . stringify <$> tok + _thickness <- tok + -- 0-width rules are used to fix spacing issues: + case safeRead width of + Just (0 :: Double) -> return mempty + _ -> return horizontalRule + paragraph :: PandocMonad m => LP m Blocks paragraph = do x <- trimInlines . mconcat <$> many1 inline @@ -1595,7 +1605,7 @@ blockCommands = M.fromList -- , ("hrule", pure horizontalRule) , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok $> horizontalRule) + , ("rule", rule) , ("item", looseItem) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", para . trimInlines <$> (skipopts *> tok)) -- cgit v1.2.3 From 7b11cdee49ec491a33dcade6ea68c3084a425992 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 2 Dec 2020 10:46:23 -0800 Subject: Citeproc: ensure that BCP47 lang codes can be used. We ignore the variants and just use the base lang code and country code when passing off to citeproc. --- src/Text/Pandoc/Citeproc.hs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Citeproc.hs b/src/Text/Pandoc/Citeproc.hs index a48f97c3b..0de2882ae 100644 --- a/src/Text/Pandoc/Citeproc.hs +++ b/src/Text/Pandoc/Citeproc.hs @@ -16,6 +16,7 @@ import Text.Pandoc.Citeproc.BibTeX (readBibtexString, Variant(..)) import Text.Pandoc.Citeproc.MetaValue (metaValueToReference, metaValueToText) import Text.Pandoc.Readers.Markdown (yamlToRefs) import Text.Pandoc.Class (setResourcePath, getResourcePath, getUserDataDir) +import qualified Text.Pandoc.BCP47 as BCP47 import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as L import Text.Pandoc.Definition as Pandoc @@ -91,8 +92,9 @@ processCitations (Pandoc meta bs) = do case styleRes of Left err -> throwError $ PandocAppError $ prettyCiteprocError err Right style -> return style{ styleAbbreviations = mbAbbrevs } - let mblang = parseLang <$> - ((lookupMeta "lang" meta <|> lookupMeta "locale" meta) >>= metaValueToText) + mblang <- maybe (return Nothing) bcp47LangToIETF + ((lookupMeta "lang" meta <|> lookupMeta "locale" meta) >>= + metaValueToText) let locale = Citeproc.mergeLocales mblang style let getCiteId (Cite cs _) = Set.fromList $ map B.citationId cs getCiteId _ = mempty @@ -579,3 +581,16 @@ removeFinalPeriod ils = isRightQuote "\8217" = True isRightQuote "\187" = True isRightQuote _ = False + +bcp47LangToIETF :: PandocMonad m => Text -> m (Maybe Lang) +bcp47LangToIETF bcplang = + case BCP47.parseBCP47 bcplang of + Left _ -> do + report $ InvalidLang bcplang + return Nothing + Right lang -> + return $ Just + $ Lang (BCP47.langLanguage lang) + (if T.null (BCP47.langRegion lang) + then Nothing + else Just (BCP47.langRegion lang)) -- cgit v1.2.3 From 5bbd5a9e80c4f4356d71b54315cf75b5ae2f3650 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 3 Dec 2020 09:51:23 -0800 Subject: Docx writer: Support bold and italic in "complex script." Previously bold and italics didn't work properly in LTR text. This commit causes the w:bCs and w:iCs attributes to be used, in addition to w:b and w:i, for bold and italics respectively. Closes #6911. --- src/Text/Pandoc/Writers/Docx.hs | 8 ++++++-- test/docx/golden/custom_style_preserve.docx | Bin 10573 -> 10578 bytes test/docx/golden/definition_list.docx | Bin 9844 -> 9850 bytes test/docx/golden/inline_formatting.docx | Bin 9959 -> 9969 bytes 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index a380fd4fa..4cb879e6a 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -1272,9 +1272,13 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do $ inlinesToOpenXML opts ils wrapBookmark ident contents inlineToOpenXML' opts (Strong lst) = - withTextProp (mknode "w:b" [] ()) $ inlinesToOpenXML opts lst + withTextProp (mknode "w:b" [] ()) $ + withTextProp (mknode "w:bCs" [] ()) $ -- needed for LTR, #6911 + inlinesToOpenXML opts lst inlineToOpenXML' opts (Emph lst) = - withTextProp (mknode "w:i" [] ()) $ inlinesToOpenXML opts lst + withTextProp (mknode "w:i" [] ()) $ + withTextProp (mknode "w:iCs" [] ()) $ -- needed for LTR, #6911 + inlinesToOpenXML opts lst inlineToOpenXML' opts (Underline lst) = withTextProp (mknode "w:u" [("w:val","single")] ()) $ inlinesToOpenXML opts lst diff --git a/test/docx/golden/custom_style_preserve.docx b/test/docx/golden/custom_style_preserve.docx index ac595cdd4..17804bb81 100644 Binary files a/test/docx/golden/custom_style_preserve.docx and b/test/docx/golden/custom_style_preserve.docx differ diff --git a/test/docx/golden/definition_list.docx b/test/docx/golden/definition_list.docx index 005f5dbe3..21629e208 100644 Binary files a/test/docx/golden/definition_list.docx and b/test/docx/golden/definition_list.docx differ diff --git a/test/docx/golden/inline_formatting.docx b/test/docx/golden/inline_formatting.docx index ce1d16daa..e12e3b38d 100644 Binary files a/test/docx/golden/inline_formatting.docx and b/test/docx/golden/inline_formatting.docx differ -- cgit v1.2.3 From 9c6cc79c11d191c74aae8646279a2b94ae253330 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 3 Dec 2020 21:24:27 -0800 Subject: EPUB writer: add frontmatter type on body element for nav.xhtml. Closes #6918. --- src/Text/Pandoc/Writers/EPUB.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 12004889f..eadd1100f 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -845,7 +845,9 @@ pandocToEPUB version opts doc = do [ unode "ol" landmarkItems ] | not (null landmarkItems)] navData <- lift $ writeHtml opts'{ writerVariables = - Context (M.fromList [("navpage", toVal' "true")]) + Context (M.fromList [("navpage", toVal' "true") + ,("body-type", toVal' "frontmatter") + ]) <> cssvars False <> vars } (Pandoc (setMeta "title" (walk removeNote $ fromList $ docTitle' meta) nullMeta) -- cgit v1.2.3 From 7199d68ba078148ff76a38f2c483da73edd62747 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 3 Dec 2020 21:39:19 -0800 Subject: EPUB writer: include title page in landmarks. Closes #6919. Note that the toc is also included if `--toc` is specified. --- src/Text/Pandoc/Writers/EPUB.hs | 9 +++++++-- test/command/5986.md | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index eadd1100f..5867b7c80 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -825,7 +825,12 @@ pandocToEPUB version opts doc = do [ unode "h1" ! [("id","toc-title")] $ tocTitle , unode "ol" ! [("class","toc")] $ tocBlocks ]] let landmarkItems = if epub3 - then [ unode "li" + then unode "li" + [ unode "a" ! [("href", + "text/title_page.xhtml") + ,("epub:type", "titlepage")] $ + ("Title Page" :: String) ] : + [ unode "li" [ unode "a" ! [("href", "text/cover.xhtml") ,("epub:type", "cover")] $ ("Cover" :: String)] | @@ -834,7 +839,7 @@ pandocToEPUB version opts doc = do [ unode "li" [ unode "a" ! [("href", "#toc") ,("epub:type", "toc")] $ - ("Table of contents" :: String) + ("Table of Contents" :: String) ] | writerTableOfContents opts ] else [] diff --git a/test/command/5986.md b/test/command/5986.md index e24aebb64..ea0ca70c1 100644 --- a/test/command/5986.md +++ b/test/command/5986.md @@ -6,7 +6,8 @@

-- cgit v1.2.3 From 171d3db3848a8ca79480688748d0ffff67ed2039 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 4 Dec 2020 09:47:56 -0800 Subject: HTML writer: Fix handling of nested csl- display spans. Previously inner Spans used to represent CSL display attributes were not rendered as div tags. See #6921. --- src/Text/Pandoc/Writers/HTML.hs | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index c92131d5a..76f17f77a 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -695,12 +695,12 @@ blockToHtml opts (Div (ident, "section":dclasses, dkvs) let fragmentClass = case slideVariant of RevealJsSlides -> "fragment" _ -> "incremental" - let inDiv zs = RawBlock (Format "html") ("
fragmentClass <> "\">") : (zs ++ [RawBlock (Format "html") "
"]) let breakOnPauses zs = case splitBy isPause zs of [] -> [] - y:ys -> y ++ concatMap inDiv ys + y:ys -> y ++ concatMap inDiv' ys let (titleBlocks, innerSecs) = if titleSlide -- title slides have no content of their own @@ -783,9 +783,7 @@ blockToHtml opts (Div attr@(ident, classes, kvs') bs) = do -- a newline between the column divs, which throws -- off widths! see #4028 mconcat <$> mapM (blockToHtml opts) bs - else if isCslBibEntry - then mconcat <$> mapM (cslEntryToHtml opts') bs - else blockListToHtml opts' bs + else blockListToHtml opts' bs let contents' = nl opts >> contents >> nl opts let (divtag, classes'') = if html5 && "section" `elem` classes' then (H5.section, filter (/= "section") classes') @@ -1213,6 +1211,10 @@ inlineToHtml opts inline = do LineBreak -> return $ do if html5 then H5.br else H.br strToHtml "\n" + (Span ("",[cls],[]) ils) + | cls == "csl-block" || cls == "csl-left-margin" || + cls == "csl-right-inline" || cls == "csl-indent" + -> inlineListToHtml opts ils >>= inDiv cls (Span (id',classes,kvs) ils) -> let spanLikeTag = case classes of @@ -1462,22 +1464,12 @@ blockListToNote opts ref blocks = do _ -> noteItem return $ nl opts >> noteItem' -cslEntryToHtml :: PandocMonad m - => WriterOptions - -> Block - -> StateT WriterState m Html -cslEntryToHtml opts (Para xs) = do +inDiv :: PandocMonad m=> Text -> Html -> StateT WriterState m Html +inDiv cls x = do html5 <- gets stHtml5 - let inDiv :: Text -> Html -> Html - inDiv cls x = (if html5 then H5.div else H.div) - x ! A.class_ (toValue cls) - let go (Span ("",[cls],[]) ils) - | cls == "csl-block" || cls == "csl-left-margin" || - cls == "csl-right-inline" || cls == "csl-indent" - = inDiv cls <$> inlineListToHtml opts ils - go il = inlineToHtml opts il - mconcat <$> mapM go xs -cslEntryToHtml opts x = blockToHtml opts x + return $ + (if html5 then H5.div else H.div) + x ! A.class_ (toValue cls) isMathEnvironment :: Text -> Bool isMathEnvironment s = "\\begin{" `T.isPrefixOf` s && -- cgit v1.2.3 From 68bcddeb2136242aabee716420ecad251fc38920 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 4 Dec 2020 10:14:19 -0800 Subject: LaTeX writer: Fix bug with nested csl- display Spans. See #6921. --- src/Text/Pandoc/Writers/LaTeX.hs | 68 +++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs index d665269ab..e02cc2833 100644 --- a/src/Text/Pandoc/Writers/LaTeX.hs +++ b/src/Text/Pandoc/Writers/LaTeX.hs @@ -553,9 +553,7 @@ blockToLaTeX (Div (identifier,classes,kvs) bs) = do Just s -> braces (literal s)) $$ inner $+$ "\\end{CSLReferences}" - else if "csl-entry" `elem` classes - then vcat <$> mapM cslEntryToLaTeX bs - else blockListToLaTeX bs + else blockListToLaTeX bs modify $ \st -> st{ stIncremental = oldIncremental } linkAnchor' <- hypertarget True identifier empty -- see #2704 for the motivation for adding \leavevmode: @@ -1180,23 +1178,6 @@ isQuoted :: Inline -> Bool isQuoted (Quoted _ _) = True isQuoted _ = False -cslEntryToLaTeX :: PandocMonad m - => Block - -> LW m (Doc Text) -cslEntryToLaTeX (Para xs) = - mconcat <$> mapM go xs - where - go (Span ("",["csl-block"],[]) ils) = - (cr <>) . inCmd "CSLBlock" <$> inlineListToLaTeX ils - go (Span ("",["csl-left-margin"],[]) ils) = - inCmd "CSLLeftMargin" <$> inlineListToLaTeX ils - go (Span ("",["csl-right-inline"],[]) ils) = - (cr <>) . inCmd "CSLRightInline" <$> inlineListToLaTeX ils - go (Span ("",["csl-indent"],[]) ils) = - (cr <>) . inCmd "CSLIndent" <$> inlineListToLaTeX ils - go il = inlineToLaTeX il -cslEntryToLaTeX x = blockToLaTeX x - -- | Convert inline element to LaTeX inlineToLaTeX :: PandocMonad m => Inline -- ^ Inline to convert @@ -1204,23 +1185,38 @@ inlineToLaTeX :: PandocMonad m inlineToLaTeX (Span (id',classes,kvs) ils) = do linkAnchor <- hypertarget False id' empty lang <- toLang $ lookup "lang" kvs - let cmds = ["textup" | "csl-no-emph" `elem` classes] ++ - ["textnormal" | "csl-no-strong" `elem` classes || - "csl-no-smallcaps" `elem` classes] ++ - ["RL" | ("dir", "rtl") `elem` kvs] ++ - ["LR" | ("dir", "ltr") `elem` kvs] ++ - (case lang of - Just lng -> let (l, o) = toPolyglossia lng - ops = if T.null o then "" else "[" <> o <> "]" - in ["text" <> l <> ops] - Nothing -> []) + let classToCmd "csl-no-emph" = Just "textup" + classToCmd "csl-no-strong" = Just "textnormal" + classToCmd "csl-no-smallcaps" = Just "textnormal" + classToCmd "csl-block" = Just "CSLBlock" + classToCmd "csl-left-margin" = Just "CSLLeftMargin" + classToCmd "csl-right-inline" = Just "CSLRightInline" + classToCmd "csl-indent" = Just "CSLIndent" + classToCmd _ = Nothing + kvToCmd ("dir","rtl") = Just "RL" + kvToCmd ("dir","ltr") = Just "LR" + kvToCmd _ = Nothing + langCmds = + case lang of + Just lng -> let (l, o) = toPolyglossia lng + ops = if T.null o then "" else "[" <> o <> "]" + in ["text" <> l <> ops] + Nothing -> [] + let cmds = mapMaybe classToCmd classes ++ mapMaybe kvToCmd kvs ++ langCmds contents <- inlineListToLaTeX ils - return $ (if T.null id' - then empty - else "\\protect" <> linkAnchor) <> - (if null cmds - then braces contents - else foldr inCmd contents cmds) + return $ + (case classes of + ["csl-block"] -> (cr <>) + ["csl-left-margin"] -> (cr <>) + ["csl-right-inline"] -> (cr <>) + ["csl-indent"] -> (cr <>) + _ -> id) $ + (if T.null id' + then empty + else "\\protect" <> linkAnchor) <> + (if null cmds + then braces contents + else foldr inCmd contents cmds) inlineToLaTeX (Emph lst) = inCmd "emph" <$> inlineListToLaTeX lst inlineToLaTeX (Underline lst) = inCmd "underline" <$> inlineListToLaTeX lst inlineToLaTeX (Strong lst) = inCmd "textbf" <$> inlineListToLaTeX lst -- cgit v1.2.3 From dc3ef5201f9531bc405ac07e763d9f004bb6bc91 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 4 Dec 2020 10:55:48 -0800 Subject: Markdown writer: ensure that a new csl-block begins on a new line. This just looks better and doesn't affect the semantics. See #6921. --- src/Text/Pandoc/Writers/Markdown.hs | 7 ++++++- test/command/pandoc-citeproc-53.md | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 6aec6b244..5eb47b261 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -1058,7 +1058,12 @@ inlineToMarkdown opts (Span ("",["emoji"],kvs) [Str s]) = inlineToMarkdown opts (Span attrs ils) = do variant <- asks envVariant contents <- inlineListToMarkdown opts ils - return $ case variant of + return $ case attrs of + (_,["csl-block"],_) -> (cr <>) + (_,["csl-left-margin"],_) -> (cr <>) + (_,["csl-indent"],_) -> (cr <>) + _ -> id + $ case variant of PlainText -> contents _ | attrs == nullAttr -> contents | isEnabled Ext_bracketed_spans opts -> diff --git a/test/command/pandoc-citeproc-53.md b/test/command/pandoc-citeproc-53.md index 295f52049..fb8d5c35e 100644 --- a/test/command/pandoc-citeproc-53.md +++ b/test/command/pandoc-citeproc-53.md @@ -28,11 +28,13 @@ Doe[^1] Doe[^2] Roe[^3] Roe[^4] Doe[^5] Doe[^6] Roe[^7] Roe[^8] ::: {#refs .references .csl-bib-body} ::: {#ref-a .csl-entry} -[[Doe J.]{.smallcaps} ]{.csl-block}[2000, *Work A*,.]{.csl-left-margin} +[[Doe J.]{.smallcaps} ]{.csl-block} +[2000, *Work A*,.]{.csl-left-margin} ::: ::: {#ref-b .csl-entry} -[[Roe J.]{.smallcaps} ]{.csl-block}[1990, *Work B*,.]{.csl-left-margin} +[[Roe J.]{.smallcaps} ]{.csl-block} +[1990, *Work B*,.]{.csl-left-margin} ::: ::: -- cgit v1.2.3 From 6f35600204132c5a0907bb7cccb2e10ffb27506a Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 5 Dec 2020 06:00:21 +0100 Subject: Docbook writer: add XML namespaces to top-level elements (#6923) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, we only added xmlns attributes to chapter elements, even when running with --top-level-division=section. Let’s add the namespaces to part and section elements too, when they are the selected top-level divisions. We do not need to add namespaces to documents produced with --standalone flag, since those will already have xmlns attribute on the root element in the template. --- src/Text/Pandoc/Writers/Docbook.hs | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 408d8cc0c..3f4c67f10 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -15,6 +15,7 @@ Conversion of 'Pandoc' documents to Docbook XML. module Text.Pandoc.Writers.Docbook ( writeDocbook4, writeDocbook5 ) where import Control.Monad.Reader import Data.Generics (everywhere, mkT) +import Data.Maybe (isNothing) import Data.Monoid (Any (..)) import Data.Text (Text) import qualified Data.Text as T @@ -40,6 +41,18 @@ data DocBookVersion = DocBook4 | DocBook5 type DB = ReaderT DocBookVersion +-- | Get level of the top-level headers based on the configured top-level division. +-- The header level can then be used to determine appropriate DocBook element +-- for each subdivision associated with a header. +-- The numbering here follows LaTeX's internal numbering +getStartLvl :: WriterOptions -> Int +getStartLvl opts = + case writerTopLevelDivision opts of + TopLevelPart -> -1 + TopLevelChapter -> 0 + TopLevelSection -> 1 + TopLevelDefault -> 1 + -- | Convert list of authors to a docbook section authorToDocbook :: PandocMonad m => WriterOptions -> [Inline] -> DB m B.Inlines authorToDocbook opts name' = do @@ -79,12 +92,7 @@ writeDocbook opts (Pandoc meta blocks) = do let colwidth = if writerWrapText opts == WrapAuto then Just $ writerColumns opts else Nothing - -- The numbering here follows LaTeX's internal numbering - let startLvl = case writerTopLevelDivision opts of - TopLevelPart -> -1 - TopLevelChapter -> 0 - TopLevelSection -> 1 - TopLevelDefault -> 1 + let startLvl = getStartLvl opts let fromBlocks = blocksToDocbook opts . makeSections False (Just startLvl) auths' <- mapM (authorToDocbook opts) $ docAuthors meta @@ -170,8 +178,12 @@ blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do then "xml:id" else "id" idAttr = [(idName, writerIdentifierPrefix opts <> id') | not (T.null id')] - nsAttr = if version == DocBook5 && lvl == 0 then [("xmlns", "http://docbook.org/ns/docbook"),("xmlns:xlink", "http://www.w3.org/1999/xlink")] - else [] + -- We want to add namespaces to the root (top-level) element. + nsAttr = if version == DocBook5 && lvl == getStartLvl opts && isNothing (writerTemplate opts) + -- Though, DocBook 4 does not support namespaces and + -- standalone documents will include them in the template. + then [("xmlns", "http://docbook.org/ns/docbook"),("xmlns:xlink", "http://www.w3.org/1999/xlink")] + else [] attribs = nsAttr <> idAttr title' <- inlinesToDocbook opts ils contents <- blocksToDocbook opts bs -- cgit v1.2.3 From ddb76cb356a82f6a9e51a6f3626dd154816e9205 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 5 Dec 2020 09:53:39 -0800 Subject: LaTeX reader: don't apply theorem default styling to a figure inside. If we put an image in italics, then when rendering to Markdown we no longer get an implicit figure. Closes #6925. --- src/Text/Pandoc/Readers/LaTeX.hs | 1 + test/command/6925.md | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 test/command/6925.md (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 15a1a19fc..afe960454 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1812,6 +1812,7 @@ theoremEnvironment name = do _ -> bs italicize :: Block -> Block +italicize x@(Para [Image{}]) = x -- see #6925 italicize (Para ils) = Para [Emph ils] italicize (Plain ils) = Plain [Emph ils] italicize x = x diff --git a/test/command/6925.md b/test/command/6925.md new file mode 100644 index 000000000..458a0b91d --- /dev/null +++ b/test/command/6925.md @@ -0,0 +1,34 @@ +``` +% pandoc -f latex -t markdown +\documentclass{amsart} +\newtheorem{thm}{Theorem}[section] +\theoremstyle{definition} +\newtheorem{thm2}[section]{Theorem} +\begin{document} +\begin{thm} +a +\begin{figure} +\includegraphics[]{1.png} +\end{figure} +\end{thm} + +\begin{thm2} +a +\begin{figure} +\includegraphics[]{1.png} +\end{figure} +\end{thm2} +\end{document} +^D +::: {.thm} +**Theorem 1**. *a* + +![image](1.png) +::: + +::: {.thm2} +**Theorem 1**. a + +![image](1.png) +::: +``` -- cgit v1.2.3 From c161893f442a3e001b64af1421e9f62376d71c92 Mon Sep 17 00:00:00 2001 From: Nils Carlson Date: Sat, 5 Dec 2020 18:00:04 +0000 Subject: OpenDocument writer: Allow references for internal links (#6774) This commit adds two extensions to the OpenDocument writer, `xrefs_name` and `xrefs_number`. Links to headings, figures and tables inside the document are substituted with cross-references that will use the name or caption of the referenced item for `xrefs_name` or the number for `xrefs_number`. For the `xrefs_number` to be useful heading numbers must be enabled in the generated document and table and figure captions must be enabled using for example the `native_numbering` extension. In order for numbers and reference text to be updated the generated document must be refreshed. Co-authored-by: Nils Carlson --- MANUAL.txt | 37 ++++++++++++++ src/Text/Pandoc/Extensions.hs | 4 ++ src/Text/Pandoc/Writers/OpenDocument.hs | 91 ++++++++++++++++++++++++++------- test/command/6774.md | 63 +++++++++++++++++++++++ 4 files changed, 177 insertions(+), 18 deletions(-) create mode 100644 test/command/6774.md (limited to 'src/Text') diff --git a/MANUAL.txt b/MANUAL.txt index fad885fed..4e1615ff2 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -3027,6 +3027,43 @@ This extension can be enabled/disabled for the following formats: output formats : `odt`, `opendocument` +#### Extension: `xrefs_name` #### + +Links to headings, figures and tables inside the document are +substituted with cross-references that will use the name or caption +of the referenced item. The original link text is replaced once +the generated document is refreshed. This extension can be combined +with `xrefs_number` in which case numbers will appear before the +name. + +Text in cross-references is only made consistent with the referenced +item once the document has been refreshed. + +This extension can be enabled/disabled for the following formats: + +output formats +: `odt`, `opendocument` + +#### Extension: `xrefs_number` #### + +Links to headings, figures and tables inside the document are +substituted with cross-references that will use the number +of the referenced item. The original link text is discarded. +This extension can be combined with `xrefs_name` in which case +the name or caption numbers will appear after the number. + +For the `xrefs_number` to be useful heading numbers must be enabled +in the generated document, also table and figure captions must be enabled +using for example the `native_numbering` extension. + +Numbers in cross-references are only visible in the final document once +it has been refreshed. + +This extension can be enabled/disabled for the following formats: + +output formats +: `odt`, `opendocument` + #### Extension: `styles` #### {#ext-styles} When converting from docx, read all docx styles as divs (for diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index 646f7abfb..a94e24e2c 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -149,6 +149,8 @@ data Extension = | Ext_tex_math_dollars -- ^ TeX math between $..$ or $$..$$ | Ext_tex_math_double_backslash -- ^ TeX math btw \\(..\\) \\[..\\] | Ext_tex_math_single_backslash -- ^ TeX math btw \(..\) \[..\] + | Ext_xrefs_name -- ^ Use xrefs with names + | Ext_xrefs_number -- ^ Use xrefs with numbers | Ext_yaml_metadata_block -- ^ YAML metadata block | Ext_gutenberg -- ^ Use Project Gutenberg conventions for plain | Ext_attributes -- ^ Generic attribute syntax @@ -465,6 +467,8 @@ getAllExtensions f = universalExtensions <> getAll f getAll "opendocument" = extensionsFromList [ Ext_empty_paragraphs , Ext_native_numbering + , Ext_xrefs_name + , Ext_xrefs_number ] getAll "odt" = getAll "opendocument" <> autoIdExtensions getAll "muse" = autoIdExtensions <> diff --git a/src/Text/Pandoc/Writers/OpenDocument.hs b/src/Text/Pandoc/Writers/OpenDocument.hs index 071a5542f..cf42f2228 100644 --- a/src/Text/Pandoc/Writers/OpenDocument.hs +++ b/src/Text/Pandoc/Writers/OpenDocument.hs @@ -17,6 +17,7 @@ module Text.Pandoc.Writers.OpenDocument ( writeOpenDocument ) where import Control.Arrow ((***), (>>>)) import Control.Monad.State.Strict hiding (when) import Data.Char (chr) +import Data.Foldable (find) import Data.List (sortOn, sortBy, foldl') import qualified Data.Map as Map import Data.Maybe (fromMaybe, isNothing) @@ -35,6 +36,7 @@ import Text.DocLayout import Text.Pandoc.Shared (linesToPara, tshow, blocksToInlines) import Text.Pandoc.Templates (renderTemplate) import qualified Text.Pandoc.Translations as Term (Term(Figure, Table)) +import Text.Pandoc.Walk import Text.Pandoc.Writers.Math import Text.Pandoc.Writers.Shared import qualified Text.Pandoc.Writers.AnnotatedTable as Ann @@ -54,6 +56,11 @@ plainToPara x = x type OD m = StateT WriterState m +data ReferenceType + = HeaderRef + | TableRef + | ImageRef + data WriterState = WriterState { stNotes :: [Doc Text] , stTableStyles :: [Doc Text] @@ -69,6 +76,7 @@ data WriterState = , stImageId :: Int , stTableCaptionId :: Int , stImageCaptionId :: Int + , stIdentTypes :: [(Text,ReferenceType)] } defaultWriterState :: WriterState @@ -86,6 +94,7 @@ defaultWriterState = , stImageId = 1 , stTableCaptionId = 1 , stImageCaptionId = 1 + , stIdentTypes = [] } when :: Bool -> Doc Text -> Doc Text @@ -243,6 +252,12 @@ writeOpenDocument opts (Pandoc meta blocks) = do meta ((body, metadata),s) <- flip runStateT defaultWriterState $ do + let collectInlineIdent (Image (ident,_,_) _ _) = [(ident,ImageRef)] + collectInlineIdent _ = [] + let collectBlockIdent (Header _ (ident,_,_) _) = [(ident,HeaderRef)] + collectBlockIdent (Table (ident,_,_) _ _ _ _ _) = [(ident,TableRef)] + collectBlockIdent _ = [] + modify $ \s -> s{ stIdentTypes = query collectBlockIdent blocks ++ query collectInlineIdent blocks } m <- metaToContext opts (blocksToOpenDocument opts) (fmap chomp . inlinesToOpenDocument opts) @@ -411,7 +426,7 @@ blockToOpenDocument o bs inTags True "text:list" [ ("text:style-name", "L" <> tshow ln)] <$> orderedListToOpenDocument o pn b table :: PandocMonad m => Ann.Table -> OD m (Doc Text) - table (Ann.Table _ (Caption _ c) colspecs thead tbodies _) = do + table (Ann.Table (ident, _, _) (Caption _ c) colspecs thead tbodies _) = do tn <- length <$> gets stTableStyles pn <- length <$> gets stParaStyles let genIds = map chr [65..] @@ -433,7 +448,7 @@ blockToOpenDocument o bs then return empty else inlinesToOpenDocument o (blocksToInlines c) >>= if isEnabled Ext_native_numbering o - then numberedTableCaption + then numberedTableCaption ident else unNumberedCaption "TableCaption" th <- colHeadsToOpenDocument o (map fst paraHStyles) thead tr <- mapM (tableBodyToOpenDocument o (map fst paraStyles)) tbodies @@ -442,36 +457,39 @@ blockToOpenDocument o bs , ("table:style-name", name) ] (vcat columns $$ th $$ vcat tr) return $ captionDoc $$ tableDoc - figure attr caption source title | null caption = + figure attr@(ident, _, _) caption source title | null caption = withParagraphStyle o "Figure" [Para [Image attr caption (source,title)]] | otherwise = do imageDoc <- withParagraphStyle o "FigureWithCaption" [Para [Image attr caption (source,title)]] captionDoc <- inlinesToOpenDocument o caption >>= if isEnabled Ext_native_numbering o - then numberedFigureCaption + then numberedFigureCaption ident else unNumberedCaption "FigureCaption" return $ imageDoc $$ captionDoc -numberedTableCaption :: PandocMonad m => Doc Text -> OD m (Doc Text) -numberedTableCaption caption = do +numberedTableCaption :: PandocMonad m => Text -> Doc Text -> OD m (Doc Text) +numberedTableCaption ident caption = do id' <- gets stTableCaptionId modify (\st -> st{ stTableCaptionId = id' + 1 }) capterm <- translateTerm Term.Table - return $ numberedCaption "TableCaption" capterm "Table" id' caption + return $ numberedCaption "TableCaption" capterm "Table" id' ident caption -numberedFigureCaption :: PandocMonad m => Doc Text -> OD m (Doc Text) -numberedFigureCaption caption = do +numberedFigureCaption :: PandocMonad m => Text -> Doc Text -> OD m (Doc Text) +numberedFigureCaption ident caption = do id' <- gets stImageCaptionId modify (\st -> st{ stImageCaptionId = id' + 1 }) capterm <- translateTerm Term.Figure - return $ numberedCaption "FigureCaption" capterm "Illustration" id' caption + return $ numberedCaption "FigureCaption" capterm "Illustration" id' ident caption -numberedCaption :: Text -> Text -> Text -> Int -> Doc Text -> Doc Text -numberedCaption style term name num caption = +numberedCaption :: Text -> Text -> Text -> Int -> Text -> Doc Text -> Doc Text +numberedCaption style term name num ident caption = let t = text $ T.unpack term r = num - 1 - s = inTags False "text:sequence" [ ("text:ref-name", "ref" <> name <> tshow r), + ident' = case ident of + "" -> "ref" <> name <> tshow r + _ -> ident + s = inTags False "text:sequence" [ ("text:ref-name", ident'), ("text:name", name), ("text:formula", "ooow:" <> name <> "+1"), ("style:num-format", "1") ] $ text $ show num @@ -607,7 +625,9 @@ inlineToOpenDocument o ils else do report $ InlineNotRendered ils return empty - Link _ l (s,t) -> mkLink s t <$> inlinesToOpenDocument o l + Link _ l (s,t) -> do + identTypes <- gets stIdentTypes + mkLink o identTypes s t <$> inlinesToOpenDocument o l Image attr _ (s,t) -> mkImg attr s t Note l -> mkNote l where @@ -619,10 +639,6 @@ inlineToOpenDocument o ils unhighlighted s = inlinedCode $ preformatted s preformatted s = handleSpaces $ escapeStringForXML s inlinedCode s = return $ inTags False "text:span" [("text:style-name", "Source_Text")] s - mkLink s t = inTags False "text:a" [ ("xlink:type" , "simple") - , ("xlink:href" , s ) - , ("office:name", t ) - ] . inSpanTags "Definition" mkImg (_, _, kvs) s _ = do id' <- gets stImageId modify (\st -> st{ stImageId = id' + 1 }) @@ -659,6 +675,45 @@ inlineToOpenDocument o ils addNote nn return nn +mkLink :: WriterOptions -> [(Text,ReferenceType)] -> Text -> Text -> Doc Text -> Doc Text +mkLink o identTypes s t d = + let maybeIdentAndType = case T.uncons s of + Just ('#', ident) -> find ((ident ==) . fst) identTypes + _ -> Nothing + d' = inSpanTags "Definition" d + ref refType format ident = inTags False refType + [ ("text:reference-format", format ), + ("text:ref-name", ident) ] + inlineSpace = selfClosingTag "text:s" [] + bookmarkRef = ref "text:bookmark-ref" + bookmarkRefNumber ident = bookmarkRef "number" ident mempty + bookmarkRefName ident = bookmarkRef "text" ident d + bookmarkRefNameNumber ident = bookmarkRefNumber ident <> inlineSpace <> bookmarkRefName ident + bookmarkRef' + | isEnabled Ext_xrefs_number o && isEnabled Ext_xrefs_name o = bookmarkRefNameNumber + | isEnabled Ext_xrefs_name o = bookmarkRefName + | otherwise = bookmarkRefNumber + sequenceRef = ref "text:sequence-ref" + sequenceRefNumber ident = sequenceRef "value" ident mempty + sequenceRefName ident = sequenceRef "caption" ident d + sequenceRefNameNumber ident = sequenceRefNumber ident <> inlineSpace <> sequenceRefName ident + sequenceRef' + | isEnabled Ext_xrefs_number o && isEnabled Ext_xrefs_name o = sequenceRefNameNumber + | isEnabled Ext_xrefs_name o = sequenceRefName + | otherwise = sequenceRefNumber + link = inTags False "text:a" [ ("xlink:type" , "simple") + , ("xlink:href" , s ) + , ("office:name", t ) + ] d' + linkOrReference = case maybeIdentAndType of + Just (ident, HeaderRef) -> bookmarkRef' ident + Just (ident, TableRef) -> sequenceRef' ident + Just (ident, ImageRef) -> sequenceRef' ident + _ -> link + in if isEnabled Ext_xrefs_name o || isEnabled Ext_xrefs_number o + then linkOrReference + else link + bulletListStyle :: PandocMonad m => Int -> OD m (Int,(Int,[Doc Text])) bulletListStyle l = do let doStyles i = inTags True "text:list-level-style-bullet" diff --git a/test/command/6774.md b/test/command/6774.md new file mode 100644 index 000000000..66549c0f2 --- /dev/null +++ b/test/command/6774.md @@ -0,0 +1,63 @@ +``` +% pandoc -f native -t opendocument --quiet +[Header 1 ("chapter1",[],[]) [Str "The",Space,Str "Chapter"] +,Para [Str "Chapter",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "The",Space,Str "Chapter"] ("#chapter1","")]] +^D +The +Chapter +Chapter 1 references +The +Chapter +``` +``` +% pandoc -f native -t opendocument+xrefs_name --quiet +[Header 1 ("chapter1",[],[]) [Str "The",Space,Str "Chapter"] +,Para [Str "Chapter",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "The",Space,Str "Chapter"] ("#chapter1","")] +,Para [Image ("lalune",[],[]) [Str "lalune"] ("lalune.jpg","fig:Voyage dans la Lune")] +,Para [Str "Image",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "La",Space,Str "Lune"] ("#lalune","")]] +^D +The +Chapter +Chapter 1 references +The +Chapter + +lalune +Image 1 references +La +Lune +``` +``` +% pandoc -f native -t opendocument+xrefs_number --quiet +[Header 1 ("chapter1",[],[]) [Str "The",Space,Str "Chapter"] +,Para [Str "Chapter",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "The",Space,Str "Chapter"] ("#chapter1","")] +,Para [Image ("lalune",[],[]) [Str "lalune"] ("lalune.jpg","fig:Voyage dans la Lune")] +,Para [Str "Image",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "La",Space,Str "Lune"] ("#lalune","")]] +^D +The +Chapter +Chapter 1 references + + +lalune +Image 1 references + +``` +``` +% pandoc -f native -t opendocument+xrefs_number+xrefs_name --quiet +[Header 1 ("chapter1",[],[]) [Str "The",Space,Str "Chapter"] +,Para [Str "Chapter",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "The",Space,Str "Chapter"] ("#chapter1","")] +,Para [Image ("lalune",[],[]) [Str "lalune"] ("lalune.jpg","fig:Voyage dans la Lune")] +,Para [Str "Image",Space,Str "1",Space,Str "references",Space,Link ("",[],[]) [Str "La",Space,Str "Lune"] ("#lalune","")]] +^D +The +Chapter +Chapter 1 references +The +Chapter + +lalune +Image 1 references +La +Lune +``` -- cgit v1.2.3 From acf932825bfe40d9a18046c9d304f4f14363a88a Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sat, 5 Dec 2020 22:05:37 +0100 Subject: Org reader: preserve targets of spurious links Links with (internal) targets that the reader doesn't know about are converted into emphasized text. Information on the link target is now preserved by wrapping the text in a Span of class `spurious-link`, with an attribute `target` set to the link's original target. This allows to recover and fix broken or unknown links with filters. See: #6916 --- src/Text/Pandoc/Readers/Org/Inlines.hs | 9 ++++----- test/Tests/Readers/Org/Meta.hs | 6 ++++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index b234bee58..0330cf55f 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -477,17 +477,17 @@ linkToInlinesF linkStr = internalLink :: Text -> Inlines -> F Inlines internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB + ids <- asksF orgStateAnchorIds + if link `elem` ids then return $ B.link ("#" <> link) "" title - else return $ B.emph title + else let attr' = ("", ["spurious-link"] , [("target", link)]) + in return $ B.spanWith attr' (B.emph title) -- | Parse an anchor like @<>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if -- @anchor-id@ contains spaces, we are more restrictive in what is accepted as -- an anchor. - anchor :: PandocMonad m => OrgParser m (F Inlines) anchor = try $ do anchorId <- parseAnchor @@ -501,7 +501,6 @@ anchor = try $ do -- | Replace every char but [a-zA-Z0-9_.-:] with a hyphen '-'. This mirrors -- the org function @org-export-solidify-link-text@. - solidify :: Text -> Text solidify = T.map replaceSpecialChar where replaceSpecialChar c diff --git a/test/Tests/Readers/Org/Meta.hs b/test/Tests/Readers/Org/Meta.hs index 041016f64..bc167f2a5 100644 --- a/test/Tests/Readers/Org/Meta.hs +++ b/test/Tests/Readers/Org/Meta.hs @@ -270,7 +270,8 @@ tests = , "Search links are read as emph" =: "[[Wally][Where's Wally?]]" =?> - para (emph $ "Where's" <> space <> "Wally?") + para (spanWith ("", ["spurious-link"], [("target", "Wally")]) + (emph $ "Where's" <> space <> "Wally?")) , "Link to nonexistent anchor" =: T.unlines [ "<> Target." @@ -278,5 +279,6 @@ tests = , "[[link$here][See here!]]" ] =?> (para (spanWith ("link-here", [], []) mempty <> "Target.") <> - para (emph ("See" <> space <> "here!"))) + para (spanWith ("", ["spurious-link"], [("target", "link$here")]) + (emph ("See" <> space <> "here!")))) ] -- cgit v1.2.3 From dc6856530c2cb6ca58ed82721ab895b86cfe0c1c Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Fri, 4 Dec 2020 09:16:56 +0100 Subject: Docbook writer: handle admonitions Similarly to https://github.com/jgm/pandoc/commit/d6fdfe6f2bba2a8ed25d6c9f11861774001f7a91, we should handle admonitions. --- src/Text/Pandoc/Writers/Docbook.hs | 42 +++++++++++++++++++++--------- test/Tests/Writers/Docbook.hs | 52 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 12 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 3f4c67f10..da111cbc5 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -188,18 +188,36 @@ blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do title' <- inlinesToDocbook opts ils contents <- blocksToDocbook opts bs return $ inTags True tag attribs $ inTagsSimple "title" title' $$ contents -blockToDocbook opts (Div (ident,_,_) [Para lst]) = - let attribs = [("id", ident) | not (T.null ident)] in - if hasLineBreaks lst - then flush . nowrap . inTags False "literallayout" attribs - <$> inlinesToDocbook opts lst - else inTags True "para" attribs <$> inlinesToDocbook opts lst -blockToDocbook opts (Div (ident,_,_) bs) = do - contents <- blocksToDocbook opts (map plainToPara bs) - return $ - (if T.null ident - then mempty - else selfClosingTag "anchor" [("id", ident)]) $$ contents +blockToDocbook opts (Div (ident,classes,_) bs) = + let identAttribs = [("id", ident) | not (T.null ident)] + admonitions = ["attention","caution","danger","error","hint", + "important","note","tip","warning"] + in case classes of + (l:_) | l `elem` admonitions -> do + let (mTitleBs, bodyBs) = + case bs of + -- Matches AST produced by the Docbook reader. + (Div (_,["title"],_) ts : rest) -> (Just (blocksToDocbook opts ts), rest) + _ -> (Nothing, bs) + admonitionTitle <- case mTitleBs of + Nothing -> return mempty + -- id will be attached to the admonition so let’s pass empty identAttrs. + Just titleBs -> inTags False "title" [] <$> titleBs + admonitionBody <- handleDivBody [] bodyBs + return (inTags True l identAttribs (admonitionTitle $$ admonitionBody)) + _ -> handleDivBody identAttribs bs + where + handleDivBody identAttribs [Para lst] = + if hasLineBreaks lst + then flush . nowrap . inTags False "literallayout" identAttribs + <$> inlinesToDocbook opts lst + else inTags True "para" identAttribs <$> inlinesToDocbook opts lst + handleDivBody identAttribs bodyBs = do + contents <- blocksToDocbook opts (map plainToPara bodyBs) + return $ + (if null identAttribs + then mempty + else selfClosingTag "anchor" identAttribs) $$ contents blockToDocbook _ h@Header{} = do -- should be handled by Div section above, except inside lists/blockquotes report $ BlockNotRendered h diff --git a/test/Tests/Writers/Docbook.hs b/test/Tests/Writers/Docbook.hs index f6a047b0b..1d53dcfe7 100644 --- a/test/Tests/Writers/Docbook.hs +++ b/test/Tests/Writers/Docbook.hs @@ -70,6 +70,58 @@ tests = [ testGroup "line blocks" , "" ] ) ] + , testGroup "divs" + [ "admonition" =: divWith ("foo", ["warning"], []) (para "This is a test") + =?> unlines + [ "" + , " " + , " This is a test" + , " " + , "" + ] + , "admonition-with-title" =: + divWith ("foo", ["attention"], []) ( + divWith ("foo", ["title"], []) + (plain (text "This is title")) <> + para "This is a test" + ) + =?> unlines + [ "" + , " This is title" + , " " + , " This is a test" + , " " + , "" + ] + , "single-child" =: + divWith ("foo", [], []) (para "This is a test") + =?> unlines + [ "" + , " This is a test" + , "" + ] + , "single-literal-child" =: + divWith ("foo", [], []) lineblock + =?> unlines + [ "some text" + , "and more lines" + , "and again" + ] + , "multiple-children" =: + divWith ("foo", [], []) ( + para "This is a test" <> + para "This is an another test" + ) + =?> unlines + [ "" + , "" + , " This is a test" + , "" + , "" + , " This is an another test" + , "" + ] + ] , testGroup "compact lists" [ testGroup "bullet" [ "compact" =: bulletList [plain "a", plain "b", plain "c"] -- cgit v1.2.3 From 16ef87745702f69d5aa948fbe6d2101577dee8f4 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Fri, 4 Dec 2020 09:28:32 +0100 Subject: Docbook writer: Use correct id attribute consistently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DocBook5 should always use xml:id instead of id so let’s use it everywhere. --- src/Text/Pandoc/Writers/Docbook.hs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index da111cbc5..398920839 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -53,6 +53,13 @@ getStartLvl opts = TopLevelSection -> 1 TopLevelDefault -> 1 +-- | Get correct name for the id attribute based on DocBook version. +-- DocBook 4 used custom id attribute but DocBook 5 adopted the xml:id specification. +-- https://www.w3.org/TR/xml-id/ +idName :: DocBookVersion -> Text +idName DocBook5 = "xml:id" +idName DocBook4 = "id" + -- | Convert list of authors to a docbook section authorToDocbook :: PandocMonad m => WriterOptions -> [Inline] -> DB m B.Inlines authorToDocbook opts name' = do @@ -174,10 +181,7 @@ blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do then "section" else "sect" <> tshow n _ -> "simplesect" - idName = if version == DocBook5 - then "xml:id" - else "id" - idAttr = [(idName, writerIdentifierPrefix opts <> id') | not (T.null id')] + idAttr = [(idName version, writerIdentifierPrefix opts <> id') | not (T.null id')] -- We want to add namespaces to the root (top-level) element. nsAttr = if version == DocBook5 && lvl == getStartLvl opts && isNothing (writerTemplate opts) -- Though, DocBook 4 does not support namespaces and @@ -188,11 +192,12 @@ blockToDocbook opts (Div (id',"section":_,_) (Header lvl _ ils : xs)) = do title' <- inlinesToDocbook opts ils contents <- blocksToDocbook opts bs return $ inTags True tag attribs $ inTagsSimple "title" title' $$ contents -blockToDocbook opts (Div (ident,classes,_) bs) = - let identAttribs = [("id", ident) | not (T.null ident)] +blockToDocbook opts (Div (ident,classes,_) bs) = do + version <- ask + let identAttribs = [(idName version, ident) | not (T.null ident)] admonitions = ["attention","caution","danger","error","hint", "important","note","tip","warning"] - in case classes of + case classes of (l:_) | l `elem` admonitions -> do let (mTitleBs, bodyBs) = case bs of @@ -371,11 +376,12 @@ inlineToDocbook opts (Quoted _ lst) = inTagsSimple "quote" <$> inlinesToDocbook opts lst inlineToDocbook opts (Cite _ lst) = inlinesToDocbook opts lst -inlineToDocbook opts (Span (ident,_,_) ils) = +inlineToDocbook opts (Span (ident,_,_) ils) = do + version <- ask ((if T.null ident then mempty - else selfClosingTag "anchor" [("id", ident)]) <>) <$> - inlinesToDocbook opts ils + else selfClosingTag "anchor" [(idName version, ident)]) <>) <$> + inlinesToDocbook opts ils inlineToDocbook _ (Code _ str) = return $ inTagsSimple "literal" $ literal (escapeStringForXML str) inlineToDocbook opts (Math t str) -- cgit v1.2.3 From 70c7c5703afcbd1cbf2a80c2be515e038abcd419 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Mon, 7 Dec 2020 07:28:39 +0100 Subject: Docbook writer: Handle admonition titles from Markdown reader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docbook reader produces a `Div` with `title` class for `` element within an “admonition” element. Markdown writer then turns this into a fenced div with `title` class attribute. Since fenced divs are block elements, their content is recognized as a paragraph by the Markdown reader. This is an issue for Docbook writer because it would produce an invalid DocBook document from such AST – the `<title>` element can only contain “inline” elements. Let’s handle this invalid special case separately by unwrapping the paragraph before creating the `<title>` element. --- src/Text/Pandoc/Writers/Docbook.hs | 2 ++ test/Tests/Writers/Docbook.hs | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 398920839..affa0de04 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -201,6 +201,8 @@ blockToDocbook opts (Div (ident,classes,_) bs) = do (l:_) | l `elem` admonitions -> do let (mTitleBs, bodyBs) = case bs of + -- Matches AST produced by the DocBook reader → Markdown writer → Markdown reader chain. + (Div (_,["title"],_) [Para ts] : rest) -> (Just (inlinesToDocbook opts ts), rest) -- Matches AST produced by the Docbook reader. (Div (_,["title"],_) ts : rest) -> (Just (blocksToDocbook opts ts), rest) _ -> (Nothing, bs) diff --git a/test/Tests/Writers/Docbook.hs b/test/Tests/Writers/Docbook.hs index 1d53dcfe7..621c1280b 100644 --- a/test/Tests/Writers/Docbook.hs +++ b/test/Tests/Writers/Docbook.hs @@ -93,6 +93,20 @@ tests = [ testGroup "line blocks" , " </para>" , "</attention>" ] + , "admonition-with-title-in-para" =: + divWith ("foo", ["attention"], []) ( + divWith ("foo", ["title"], []) + (para "This is title") <> + para "This is a test" + ) + =?> unlines + [ "<attention id=\"foo\">" + , " <title>This is title" + , " " + , " This is a test" + , " " + , "" + ] , "single-child" =: divWith ("foo", [], []) (para "This is a test") =?> unlines -- cgit v1.2.3 From 501ea7f0c4735acdf1457da44fe04d811ac776d7 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 12:15:14 -0800 Subject: Dokuwiki reader: handle unknown interwiki links better. DokuWiki lets the user define his own Interwiki links. Previously pandoc reacted to these by emitting a google search link, which is not helpful. Instead, we now just emit the full URL including the wikilink prefix, e.g. `faquk>FAQ-mathml`. This at least gives users the ability to modify the links using filters. Closes #6932. --- src/Text/Pandoc/Readers/DokuWiki.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs index 336be09e5..dedc1f03f 100644 --- a/src/Text/Pandoc/Readers/DokuWiki.hs +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -317,7 +317,7 @@ interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" <> page interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" <> page interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" <> page interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" <> page -interwikiToUrl _ page = "https://www.google.com/search?q=" <> page <> "&btnI=lucky" +interwikiToUrl unknown page = unknown <> ">" <> page linkText :: PandocMonad m => DWParser m B.Inlines linkText = parseLink fromRaw "[[" "]]" -- cgit v1.2.3 From f2749ba6cd0ec2473332394bfbb6f479b667f35c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 12:56:03 -0800 Subject: Parsing: in nonspaceChar use satisfy instead of oneOf. For efficiency. --- src/Text/Pandoc/Parsing.hs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 4bae8942b..1d9e182c5 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -443,7 +443,13 @@ spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. nonspaceChar :: Stream s m Char => ParserT s st m Char -nonspaceChar = noneOf ['\t', '\n', ' ', '\r'] +nonspaceChar = satisfy (not . isSpaceChar) + where + isSpaceChar ' ' = True + isSpaceChar '\t' = True + isSpaceChar '\n' = True + isSpaceChar '\r' = True + isSpaceChar _ = False -- | Skips zero or more spaces or tabs. skipSpaces :: Stream s m Char => ParserT s st m () -- cgit v1.2.3 From 2f9b684b3a793896bc28a79a07722415cfdc075e Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 13:01:30 -0800 Subject: Bibtex parser: avoid noneOf. --- src/Text/Pandoc/Citeproc/BibTeX.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Citeproc/BibTeX.hs b/src/Text/Pandoc/Citeproc/BibTeX.hs index 552339df0..ed723a11c 100644 --- a/src/Text/Pandoc/Citeproc/BibTeX.hs +++ b/src/Text/Pandoc/Citeproc/BibTeX.hs @@ -601,7 +601,7 @@ inBraces :: BibParser Text inBraces = do char '{' res <- manyTill - ( (T.pack <$> many1 (noneOf "{}\\")) + ( take1WhileP (\c -> c /= '{' && c /= '}' && c /= '\\') <|> (char '\\' >> ( (char '{' >> return "\\{") <|> (char '}' >> return "\\}") <|> return "\\")) @@ -616,7 +616,7 @@ inQuotes :: BibParser Text inQuotes = do char '"' T.concat <$> manyTill - ( (T.pack <$> many1 (noneOf "\"\\{")) + ( take1WhileP (\c -> c /= '{' && c /= '"' && c /= '\\') <|> (char '\\' >> T.cons '\\' . T.singleton <$> anyChar) <|> braced <$> inBraces ) (char '"') -- cgit v1.2.3 From ce1791913da713ec35f514006bb532cb9c9c8d22 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 13:24:19 -0800 Subject: Small efficiency improvement in uri parser --- src/Text/Pandoc/Parsing.hs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 1d9e182c5..1e8518c90 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -654,7 +654,20 @@ uri = try $ do let uri' = scheme <> ":" <> fromEntities str' return (uri', escapeURI uri') where - wordChar = alphaNum <|> oneOf "#$%+/@\\_-&=" + isWordChar '#' = True + isWordChar '$' = True + isWordChar '%' = True + isWordChar '+' = True + isWordChar '/' = True + isWordChar '@' = True + isWordChar '\\' = True + isWordChar '_' = True + isWordChar '-' = True + isWordChar '&' = True + isWordChar '=' = True + isWordChar c = isAlphaNum c + + wordChar = satisfy isWordChar percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit entity = try $ pure <$> characterReference punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) -- cgit v1.2.3 From 0fa1023b9ef0f39be013befa9ba3d61b81d95928 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 18:57:09 -0800 Subject: Parsing: More minor performance improvements. --- src/Text/Pandoc/Parsing.hs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 1e8518c90..4ea0d788c 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -190,7 +190,7 @@ where import Control.Monad.Identity import Control.Monad.Reader -import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, +import Data.Char (chr, isAlphaNum, isAscii, isAsciiUpper, isAsciiLower, isPunctuation, isSpace, ord, toLower, toUpper) import Data.Default import Data.Functor (($>)) @@ -444,12 +444,13 @@ spaceChar = satisfy $ \c -> c == ' ' || c == '\t' -- | Parses a nonspace, nonnewline character. nonspaceChar :: Stream s m Char => ParserT s st m Char nonspaceChar = satisfy (not . isSpaceChar) - where - isSpaceChar ' ' = True - isSpaceChar '\t' = True - isSpaceChar '\n' = True - isSpaceChar '\r' = True - isSpaceChar _ = False + +isSpaceChar :: Char -> Bool +isSpaceChar ' ' = True +isSpaceChar '\t' = True +isSpaceChar '\n' = True +isSpaceChar '\r' = True +isSpaceChar _ = False -- | Skips zero or more spaces or tabs. skipSpaces :: Stream s m Char => ParserT s st m () @@ -682,7 +683,9 @@ mathInlineWith :: Stream s m Char => Text -> Text -> ParserT s st m Text mathInlineWith op cl = try $ do textStr op when (op == "$") $ notFollowedBy space - words' <- many1Till (countChar 1 (noneOf " \t\n\\") + words' <- many1Till ( + (T.singleton <$> + satisfy (\c -> not (isSpaceChar c || c == '\\'))) <|> (char '\\' >> -- This next clause is needed because \text{..} can -- contain $, \(\), etc. @@ -840,13 +843,13 @@ defaultNum = do -- | Parses a lowercase letter and returns (LowerAlpha, number). lowerAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) lowerAlpha = do - ch <- oneOf ['a'..'z'] + ch <- satisfy isAsciiLower return (LowerAlpha, ord ch - ord 'a' + 1) -- | Parses an uppercase letter and returns (UpperAlpha, number). upperAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) upperAlpha = do - ch <- oneOf ['A'..'Z'] + ch <- satisfy isAsciiUpper return (UpperAlpha, ord ch - ord 'A' + 1) -- | Parses a roman numeral i or I -- cgit v1.2.3 From 5990cbb150f3581ab9b2f3d3a726c73dcadfd793 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 21:34:23 -0800 Subject: Parsing: Small code improvements. --- src/Text/Pandoc/Parsing.hs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 4ea0d788c..979344f63 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -641,7 +641,7 @@ uri = try $ do scheme <- uriScheme char ':' -- Avoid parsing e.g. "**Notes:**" as a raw URI: - notFollowedBy (oneOf "*_]") + notFollowedBy $ satisfy (\c -> c == '*' || c == '_' || c == ']') -- We allow sentence punctuation except at the end, since -- we don't want the trailing '.' in 'http://google.com.' We want to allow -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) @@ -693,7 +693,7 @@ mathInlineWith op cl = try $ do (("\\text" <>) <$> inBalancedBraces 0 "")) <|> (\c -> T.pack ['\\',c]) <$> anyChar)) <|> do (blankline <* notFollowedBy' blankline) <|> - (oneOf " \t" <* skipMany (oneOf " \t")) + (spaceChar <* skipMany spaceChar) notFollowedBy (char '$') return " " ) (try $ textStr cl) @@ -723,7 +723,8 @@ mathInlineWith op cl = try $ do mathDisplayWith :: Stream s m Char => Text -> Text -> ParserT s st m Text mathDisplayWith op cl = try $ fmap T.pack $ do textStr op - many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ textStr cl) + many1Till (satisfy (/= '\n') <|> (newline <* notFollowedBy' blankline)) + (try $ textStr cl) mathDisplay :: (HasReaderOptions st, Stream s m Char) => ParserT s st m Text -- cgit v1.2.3 From 8c9010864cd818031d7eff161a57459709751517 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 9 Dec 2020 21:05:40 -0800 Subject: Commonmark reader: refactor specFor, set input name to "". --- src/Text/Pandoc/Readers/CommonMark.hs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index c1773eaab..d32a38342 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.CommonMark @@ -27,15 +28,20 @@ import Text.Pandoc.Options import Text.Pandoc.Error import Control.Monad.Except import Data.Functor.Identity (runIdentity) +import Data.Typeable -- | Parse a CommonMark formatted string into a 'Pandoc' structure. readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readCommonMark opts s = do - let res = runIdentity $ - commonmarkWith (foldr ($) defaultSyntaxSpec exts) "input" s + let res = runIdentity $ commonmarkWith (specFor opts) "" s case res of Left err -> throwError $ PandocParsecError s err Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls + +specFor :: (Monad m, Typeable m, Typeable a, + Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) + => ReaderOptions -> SyntaxSpec m (Cm a Inlines) (Cm a Blocks) +specFor opts = foldr ($) defaultSyntaxSpec exts where exts = [ (hardLineBreaksSpec <>) | isEnabled Ext_hard_line_breaks opts ] ++ [ (smartPunctuationSpec <>) | isEnabled Ext_smart opts ] ++ -- cgit v1.2.3 From a3eb87b2eab9def3e28364b43300043f5e13268d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 9 Dec 2020 21:14:11 -0800 Subject: Add sourcepos extension for commonmarke * Add `Ext_sourcepos` constructor for `Extension`. * Add `sourcepos` extension (only for commonmark). * Bump to 2.11.3 With the `sourcepos` extension set set, `data-pos` attributes are added to the AST by the commonmark reader. No other readers are affected. The `data-pos` attributes are put on elements that accept attributes; for other elements, an enlosing Div or Span is added to hold the attributes. Closes #4565. --- MANUAL.txt | 7 +++++++ pandoc.cabal | 2 +- src/Text/Pandoc/Extensions.hs | 2 ++ src/Text/Pandoc/Options.hs | 1 + src/Text/Pandoc/Readers/CommonMark.hs | 14 +++++++++----- 5 files changed, 20 insertions(+), 6 deletions(-) (limited to 'src/Text') diff --git a/MANUAL.txt b/MANUAL.txt index 0a1300947..461ebf54d 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -5127,6 +5127,13 @@ for regular emphasis, add extra blank space around headings. [Project Gutenberg]: https://www.gutenberg.org +#### Extension: `sourcepos` #### + +Include source position attributes when parsing `commonmark`. +For elements that accept attributes, a `data-pos` attribute +is added; other elements are placed in a surrounding +Div or Span elemnet with a `data-pos` attribute. + ## Markdown variants In addition to pandoc's extended Markdown, the following Markdown diff --git a/pandoc.cabal b/pandoc.cabal index c02dfeb38..5829856da 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: pandoc -version: 2.11.2 +version: 2.11.3 build-type: Simple license: GPL-2.0-or-later license-file: COPYING.md diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index a94e24e2c..9865f897b 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -154,6 +154,7 @@ data Extension = | Ext_yaml_metadata_block -- ^ YAML metadata block | Ext_gutenberg -- ^ Use Project Gutenberg conventions for plain | Ext_attributes -- ^ Generic attribute syntax + | Ext_sourcepos -- ^ Include source position attributes deriving (Show, Read, Enum, Eq, Ord, Bounded, Data, Typeable, Generic) -- | Extensions to be used with pandoc-flavored markdown. @@ -503,6 +504,7 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_implicit_header_references , Ext_attributes , Ext_fenced_code_attributes + , Ext_sourcepos ] getAll "commonmark_x" = getAll "commonmark" getAll "org" = autoIdExtensions <> diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index c7f1a56fa..ecd65a54d 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -65,6 +65,7 @@ data ReaderOptions = ReaderOptions{ , readerDefaultImageExtension :: Text -- ^ Default extension for images , readerTrackChanges :: TrackChanges -- ^ Track changes setting for docx , readerStripComments :: Bool -- ^ Strip HTML comments instead of parsing as raw HTML + -- (only implemented in commonmark) } deriving (Show, Read, Data, Typeable, Generic) instance HasSyntaxExtensions ReaderOptions where diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index d32a38342..9eef498e1 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -32,11 +32,15 @@ import Data.Typeable -- | Parse a CommonMark formatted string into a 'Pandoc' structure. readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCommonMark opts s = do - let res = runIdentity $ commonmarkWith (specFor opts) "" s - case res of - Left err -> throwError $ PandocParsecError s err - Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls +readCommonMark opts s + | isEnabled Ext_sourcepos opts = + case runIdentity (commonmarkWith (specFor opts) "" s) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm SourceRange Blocks) -> return $ B.doc bls + | otherwise = + case runIdentity (commonmarkWith (specFor opts) "" s) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls specFor :: (Monad m, Typeable m, Typeable a, Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) -- cgit v1.2.3 From 0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 10 Dec 2020 15:44:10 -0800 Subject: HTML reader: retain attribute prefixes and avoid duplicates. Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invald duplicate attributes. Closes #6938. --- src/Text/Pandoc/Readers/HTML.hs | 28 +++++++++++----------------- src/Text/Pandoc/Readers/HTML/Parsing.hs | 20 +++++++++++++------- test/command/5986.md | 2 +- test/epub/wasteland.native | 8 ++++---- 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..f870a241d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -74,7 +74,7 @@ readHtml :: PandocMonad m -> Text -- ^ String to parse (assumes @'\n'@ line endings) -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } (crFilter inp) parseDoc = do @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block @@ -1024,21 +1033,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..e28ebe77b 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -193,14 +193,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/test/command/5986.md b/test/command/5986.md index ea0ca70c1..ed8dd30c9 100644 --- a/test/command/5986.md +++ b/test/command/5986.md @@ -4,7 +4,7 @@ ^D

-