From 8c38390038edcebd55f9dec8359ef983f3813425 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Fri, 27 Nov 2020 21:21:25 +0100 Subject: HTML reader tests: improve test coverage of new features --- src/Text/Pandoc/Readers/HTML/Table.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 5a783988f..91639fa4c 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -214,7 +214,8 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) normalize widths head' bodies foot = do let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot - let rowLength = length . rowCells + let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs + let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells let ncols = maximum (map rowLength rows) let tblType = tableType (map rowCells rows) -- fail on empty table -- cgit v1.2.3 From 83d63b72e1b9eff9f2aa3b9f36b56d348f0909a2 Mon Sep 17 00:00:00 2001 From: Tassos Manganaris Date: Fri, 27 Nov 2020 23:42:53 -0500 Subject: Fix a tiny Typo in the CSV reader module Header comment in the CSV reader module says "RST" instead of "CSV". 
--- src/Text/Pandoc/Readers/CSV.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs index f0edcaa16..45f4d88d4 100644 --- a/src/Text/Pandoc/Readers/CSV.hs +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -2,7 +2,7 @@ {-# LANGUAGE ScopedTypeVariables #-} {- | - Module : Text.Pandoc.Readers.RST + Module : Text.Pandoc.Readers.CSV Copyright : Copyright (C) 2006-2020 John MacFarlane License : GNU GPL, version 2 or above -- cgit v1.2.3 From bff9c129c3579f928a0067759d0a784eb5c07d30 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 29 Nov 2020 10:29:51 -0800 Subject: LaTeX reader: don't parse `\rule` with width 0 as horizontal rule. --- src/Text/Pandoc/Readers/LaTeX.hs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 2a9bff746..15a1a19fc 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1225,6 +1225,16 @@ preamble = mconcat <$> many preambleBlock anyTok return mempty) +rule :: PandocMonad m => LP m Blocks +rule = do + skipopts + width <- T.takeWhile (\c -> isDigit c || c == '.') . stringify <$> tok + _thickness <- tok + -- 0-width rules are used to fix spacing issues: + case safeRead width of + Just (0 :: Double) -> return mempty + _ -> return horizontalRule + paragraph :: PandocMonad m => LP m Blocks paragraph = do x <- trimInlines . mconcat <$> many1 inline @@ -1595,7 +1605,7 @@ blockCommands = M.fromList -- , ("hrule", pure horizontalRule) , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok $> horizontalRule) + , ("rule", rule) , ("item", looseItem) , ("documentclass", skipopts *> braced *> preamble) , ("centerline", para . 
trimInlines <$> (skipopts *> tok)) -- cgit v1.2.3 From ddb76cb356a82f6a9e51a6f3626dd154816e9205 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 5 Dec 2020 09:53:39 -0800 Subject: LaTeX reader: don't apply theorem default styling to a figure inside. If we put an image in italics, then when rendering to Markdown we no longer get an implicit figure. Closes #6925. --- src/Text/Pandoc/Readers/LaTeX.hs | 1 + test/command/6925.md | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 test/command/6925.md (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 15a1a19fc..afe960454 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1812,6 +1812,7 @@ theoremEnvironment name = do _ -> bs italicize :: Block -> Block +italicize x@(Para [Image{}]) = x -- see #6925 italicize (Para ils) = Para [Emph ils] italicize (Plain ils) = Plain [Emph ils] italicize x = x diff --git a/test/command/6925.md b/test/command/6925.md new file mode 100644 index 000000000..458a0b91d --- /dev/null +++ b/test/command/6925.md @@ -0,0 +1,34 @@ +``` +% pandoc -f latex -t markdown +\documentclass{amsart} +\newtheorem{thm}{Theorem}[section] +\theoremstyle{definition} +\newtheorem{thm2}[section]{Theorem} +\begin{document} +\begin{thm} +a +\begin{figure} +\includegraphics[]{1.png} +\end{figure} +\end{thm} + +\begin{thm2} +a +\begin{figure} +\includegraphics[]{1.png} +\end{figure} +\end{thm2} +\end{document} +^D +::: {.thm} +**Theorem 1**. *a* + +![image](1.png) +::: + +::: {.thm2} +**Theorem 1**. a + +![image](1.png) +::: +``` -- cgit v1.2.3 From acf932825bfe40d9a18046c9d304f4f14363a88a Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sat, 5 Dec 2020 22:05:37 +0100 Subject: Org reader: preserve targets of spurious links Links with (internal) targets that the reader doesn't know about are converted into emphasized text. 
Information on the link target is now preserved by wrapping the text in a Span of class `spurious-link`, with an attribute `target` set to the link's original target. This allows broken or unknown links to be recovered and fixed with filters. See: #6916 --- src/Text/Pandoc/Readers/Org/Inlines.hs | 9 ++++----- test/Tests/Readers/Org/Meta.hs | 6 ++++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/Org/Inlines.hs b/src/Text/Pandoc/Readers/Org/Inlines.hs index b234bee58..0330cf55f 100644 --- a/src/Text/Pandoc/Readers/Org/Inlines.hs +++ b/src/Text/Pandoc/Readers/Org/Inlines.hs @@ -477,17 +477,17 @@ linkToInlinesF linkStr = internalLink :: Text -> Inlines -> F Inlines internalLink link title = do - anchorB <- (link `elem`) <$> asksF orgStateAnchorIds - if anchorB + ids <- asksF orgStateAnchorIds + if link `elem` ids then return $ B.link ("#" <> link) "" title - else return $ B.emph title + else let attr' = ("", ["spurious-link"] , [("target", link)]) + in return $ B.spanWith attr' (B.emph title) -- | Parse an anchor like @<<anchor-id>>@ and return an empty span with -- @anchor-id@ set as id. Legal anchors in org-mode are defined through -- @org-target-regexp@, which is fairly liberal. Since no link is created if -- @anchor-id@ contains spaces, we are more restrictive in what is accepted as -- an anchor. - anchor :: PandocMonad m => OrgParser m (F Inlines) anchor = try $ do anchorId <- parseAnchor @@ -501,7 +501,6 @@ anchor = try $ do -- | Replace every char but [a-zA-Z0-9_.-:] with a hyphen '-'. This mirrors -- the org function @org-export-solidify-link-text@. 
- solidify :: Text -> Text solidify = T.map replaceSpecialChar where replaceSpecialChar c diff --git a/test/Tests/Readers/Org/Meta.hs b/test/Tests/Readers/Org/Meta.hs index 041016f64..bc167f2a5 100644 --- a/test/Tests/Readers/Org/Meta.hs +++ b/test/Tests/Readers/Org/Meta.hs @@ -270,7 +270,8 @@ tests = , "Search links are read as emph" =: "[[Wally][Where's Wally?]]" =?> - para (emph $ "Where's" <> space <> "Wally?") + para (spanWith ("", ["spurious-link"], [("target", "Wally")]) + (emph $ "Where's" <> space <> "Wally?")) , "Link to nonexistent anchor" =: T.unlines [ "<> Target." @@ -278,5 +279,6 @@ tests = , "[[link$here][See here!]]" ] =?> (para (spanWith ("link-here", [], []) mempty <> "Target.") <> - para (emph ("See" <> space <> "here!"))) + para (spanWith ("", ["spurious-link"], [("target", "link$here")]) + (emph ("See" <> space <> "here!")))) ] -- cgit v1.2.3 From 501ea7f0c4735acdf1457da44fe04d811ac776d7 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 7 Dec 2020 12:15:14 -0800 Subject: Dokuwiki reader: handle unknown interwiki links better. DokuWiki lets the user define his own Interwiki links. Previously pandoc reacted to these by emitting a google search link, which is not helpful. Instead, we now just emit the full URL including the wikilink prefix, e.g. `faquk>FAQ-mathml`. This at least gives users the ability to modify the links using filters. Closes #6932. 
--- src/Text/Pandoc/Readers/DokuWiki.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/DokuWiki.hs b/src/Text/Pandoc/Readers/DokuWiki.hs index 336be09e5..dedc1f03f 100644 --- a/src/Text/Pandoc/Readers/DokuWiki.hs +++ b/src/Text/Pandoc/Readers/DokuWiki.hs @@ -317,7 +317,7 @@ interwikiToUrl "wpes" page = "https://es.wikipedia.org/wiki/" <> page interwikiToUrl "wpfr" page = "https://fr.wikipedia.org/wiki/" <> page interwikiToUrl "wpjp" page = "https://jp.wikipedia.org/wiki/" <> page interwikiToUrl "wppl" page = "https://pl.wikipedia.org/wiki/" <> page -interwikiToUrl _ page = "https://www.google.com/search?q=" <> page <> "&btnI=lucky" +interwikiToUrl unknown page = unknown <> ">" <> page linkText :: PandocMonad m => DWParser m B.Inlines linkText = parseLink fromRaw "[[" "]]" -- cgit v1.2.3 From 8c9010864cd818031d7eff161a57459709751517 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 9 Dec 2020 21:05:40 -0800 Subject: Commonmark reader: refactor specFor, set input name to "". --- src/Text/Pandoc/Readers/CommonMark.hs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index c1773eaab..d32a38342 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} {- | Module : Text.Pandoc.Readers.CommonMark @@ -27,15 +28,20 @@ import Text.Pandoc.Options import Text.Pandoc.Error import Control.Monad.Except import Data.Functor.Identity (runIdentity) +import Data.Typeable -- | Parse a CommonMark formatted string into a 'Pandoc' structure. 
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readCommonMark opts s = do - let res = runIdentity $ - commonmarkWith (foldr ($) defaultSyntaxSpec exts) "input" s + let res = runIdentity $ commonmarkWith (specFor opts) "" s case res of Left err -> throwError $ PandocParsecError s err Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls + +specFor :: (Monad m, Typeable m, Typeable a, + Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) + => ReaderOptions -> SyntaxSpec m (Cm a Inlines) (Cm a Blocks) +specFor opts = foldr ($) defaultSyntaxSpec exts where exts = [ (hardLineBreaksSpec <>) | isEnabled Ext_hard_line_breaks opts ] ++ [ (smartPunctuationSpec <>) | isEnabled Ext_smart opts ] ++ -- cgit v1.2.3 From a3eb87b2eab9def3e28364b43300043f5e13268d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 9 Dec 2020 21:14:11 -0800 Subject: Add sourcepos extension for commonmark * Add `Ext_sourcepos` constructor for `Extension`. * Add `sourcepos` extension (only for commonmark). * Bump to 2.11.3 With the `sourcepos` extension set, `data-pos` attributes are added to the AST by the commonmark reader. No other readers are affected. The `data-pos` attributes are put on elements that accept attributes; for other elements, an enclosing Div or Span is added to hold the attributes. Closes #4565. --- MANUAL.txt | 7 +++++++ pandoc.cabal | 2 +- src/Text/Pandoc/Extensions.hs | 2 ++ src/Text/Pandoc/Options.hs | 1 + src/Text/Pandoc/Readers/CommonMark.hs | 14 +++++++++----- 5 files changed, 20 insertions(+), 6 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/MANUAL.txt b/MANUAL.txt index 0a1300947..461ebf54d 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -5127,6 +5127,13 @@ for regular emphasis, add extra blank space around headings. [Project Gutenberg]: https://www.gutenberg.org +#### Extension: `sourcepos` #### + +Include source position attributes when parsing `commonmark`. 
+For elements that accept attributes, a `data-pos` attribute +is added; other elements are placed in a surrounding +Div or Span element with a `data-pos` attribute. + ## Markdown variants In addition to pandoc's extended Markdown, the following Markdown diff --git a/pandoc.cabal b/pandoc.cabal index c02dfeb38..5829856da 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: pandoc -version: 2.11.2 +version: 2.11.3 build-type: Simple license: GPL-2.0-or-later license-file: COPYING.md diff --git a/src/Text/Pandoc/Extensions.hs b/src/Text/Pandoc/Extensions.hs index a94e24e2c..9865f897b 100644 --- a/src/Text/Pandoc/Extensions.hs +++ b/src/Text/Pandoc/Extensions.hs @@ -154,6 +154,7 @@ data Extension = | Ext_yaml_metadata_block -- ^ YAML metadata block | Ext_gutenberg -- ^ Use Project Gutenberg conventions for plain | Ext_attributes -- ^ Generic attribute syntax + | Ext_sourcepos -- ^ Include source position attributes deriving (Show, Read, Enum, Eq, Ord, Bounded, Data, Typeable, Generic) -- | Extensions to be used with pandoc-flavored markdown. 
@@ -503,6 +504,7 @@ getAllExtensions f = universalExtensions <> getAll f , Ext_implicit_header_references , Ext_attributes , Ext_fenced_code_attributes + , Ext_sourcepos ] getAll "commonmark_x" = getAll "commonmark" getAll "org" = autoIdExtensions <> diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index c7f1a56fa..ecd65a54d 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -65,6 +65,7 @@ data ReaderOptions = ReaderOptions{ , readerDefaultImageExtension :: Text -- ^ Default extension for images , readerTrackChanges :: TrackChanges -- ^ Track changes setting for docx , readerStripComments :: Bool -- ^ Strip HTML comments instead of parsing as raw HTML + -- (only implemented in commonmark) } deriving (Show, Read, Data, Typeable, Generic) instance HasSyntaxExtensions ReaderOptions where diff --git a/src/Text/Pandoc/Readers/CommonMark.hs b/src/Text/Pandoc/Readers/CommonMark.hs index d32a38342..9eef498e1 100644 --- a/src/Text/Pandoc/Readers/CommonMark.hs +++ b/src/Text/Pandoc/Readers/CommonMark.hs @@ -32,11 +32,15 @@ import Data.Typeable -- | Parse a CommonMark formatted string into a 'Pandoc' structure. 
readCommonMark :: PandocMonad m => ReaderOptions -> Text -> m Pandoc -readCommonMark opts s = do - let res = runIdentity $ commonmarkWith (specFor opts) "" s - case res of - Left err -> throwError $ PandocParsecError s err - Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls +readCommonMark opts s + | isEnabled Ext_sourcepos opts = + case runIdentity (commonmarkWith (specFor opts) "" s) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm SourceRange Blocks) -> return $ B.doc bls + | otherwise = + case runIdentity (commonmarkWith (specFor opts) "" s) of + Left err -> throwError $ PandocParsecError s err + Right (Cm bls :: Cm () Blocks) -> return $ B.doc bls specFor :: (Monad m, Typeable m, Typeable a, Rangeable (Cm a Inlines), Rangeable (Cm a Blocks)) -- cgit v1.2.3 From 0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 10 Dec 2020 15:44:10 -0800 Subject: HTML reader: retain attribute prefixes and avoid duplicates. Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invalid duplicate attributes. Closes #6938. --- src/Text/Pandoc/Readers/HTML.hs | 28 +++++++++++----------------- src/Text/Pandoc/Readers/HTML/Parsing.hs | 20 +++++++++++++------- test/command/5986.md | 2 +- test/epub/wasteland.native | 8 ++++---- 4 files changed, 29 insertions(+), 29 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..f870a241d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -74,7 +74,7 @@ readHtml :: PandocMonad m -> Text -- ^ String to parse (assumes @'\n'@ line endings) -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . 
canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } (crFilter inp) parseDoc = do @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block @@ -1024,21 +1033,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. 
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..e28ebe77b 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -193,14 +193,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/test/command/5986.md b/test/command/5986.md index ea0ca70c1..ed8dd30c9 100644 --- a/test/command/5986.md +++ b/test/command/5986.md @@ -4,7 +4,7 @@ ^D

-