--- Minimal custom reader: wrap the whole input in a single code block.
-- @param input the raw input; either a plain string or any value whose
--   `tostring` yields the text (for example a list of sources)
-- @return a Pandoc document containing exactly one CodeBlock
function Reader(input)
  -- tostring is the identity on strings, so string-passing callers are
  -- unaffected, while richer input objects are flattened to their text.
  return pandoc.Pandoc({ pandoc.CodeBlock(tostring(input)) })
end
-- A sample custom reader that just parses text into blankline-separated
-- paragraphs with space-separated words.

-- For better performance we put the lpeg functions we use in local variables:
local P, S, Ct, V = lpeg.P, lpeg.S, lpeg.Ct, lpeg.V

local whitespacechar = S(" \t\r\n")
local wordchar = (1 - whitespacechar)
local spacechar = S(" \t")
local newline = P"\r"^-1 * P"\n"
-- two or more line ends in a row (optionally with spaces in between)
local blanklines = newline * (spacechar^0 * newline)^1
-- a single line end that does not start a run of blank lines
local endline = newline - blanklines

-- Grammar. Kept local so the script exports nothing but `Reader`.
local G = P{ "Pandoc",
  Pandoc = Ct(V"Block"^0) / pandoc.Pandoc;
  Block = blanklines^0 * V"Para" ;
  Para = Ct(V"Inline"^1) / pandoc.Para;
  Inline = V"Str" + V"Space" + V"SoftBreak" ;
  Str = wordchar^1 / pandoc.Str;
  Space = spacechar^1 / pandoc.Space;
  SoftBreak = endline / pandoc.SoftBreak;
}

--- Entry point called by pandoc.
-- @param input the raw input; a string or a value convertible with `tostring`
-- @return the Pandoc document produced by the grammar
function Reader(input)
  -- tostring is a no-op for strings and flattens richer input objects.
  return lpeg.match(G, tostring(input))
end
-- A sample custom reader for RIS bibliography format
-- https://en.wikipedia.org/wiki/RIS_(file_format)
-- The references are converted to inline pandoc/CSL YAML
-- references in the metadata.

-- Maps RIS reference types (the value of the TY tag) to CSL item types.
local types =
  { ABST = "article",
    ADVS = "motion-picture",
    AGGR = "dataset",
    ANCIENT = "book",
    ART = "graphic",
    BILL = "bill",
    BLOG = "post-weblog",
    BOOK = "book",
    CASE = "legal_case",
    CHAP = "chapter",
    CHART = "graphic",
    CLSWK = "book",
    COMP = "program",
    CONF = "paper-conference",
    CPAPER = "paper-conference",
    CTLG = "catalog",
    DATA = "dataset",
    DBASE = "dataset",
    DICT = "book",
    EBOOK = "book",
    ECHAP = "chapter",
    EDBOOK = "book",
    EJOUR = "article",
    WEB = "webpage",
    ENCYC = "entry-encyclopedia",
    EQUA = "figure",
    FIGURE = "figure",
    GEN = "entry",
    GOVDOC = "report",
    GRANT = "report",
    HEAR = "report",
    ICOMM = "personal_communication",
    INPR = "article-journal",
    JFULL = "article-journal",
    JOUR = "article-journal",
    LEGAL = "legal_case",
    MANSCPT = "manuscript",
    MAP = "map",
    MGZN = "article-magazine",
    MPCT = "motion-picture",
    MULTI = "webpage",
    MUSIC = "musical_score",
    NEWS = "article-newspaper",
    PAMP = "pamphlet",
    PAT = "patent",
    PCOMM = "personal_communication",
    RPRT = "report",
    SER = "article",
    SLIDE = "graphic",
    SOUND = "musical_score",
    STAND = "report",
    STAT = "legislation",
    THES = "thesis",
    UNBILL = "bill",
    UNPB = "unpublished",
    VIDEO = "graphic"
  }

--- Convert a plain string into MetaInlines: whitespace-separated words
-- become Str elements joined by single Space elements.
local function inlines(s)
  local ils = {}
  for word in string.gmatch(s, "%S+") do
    if #ils > 0 then
      ils[#ils + 1] = pandoc.Space()
    end
    ils[#ils + 1] = pandoc.Str(word)
  end
  return pandoc.MetaInlines(ils)
end

--- Append `item` to the list stored in `ref[field]`, creating it on first use.
local function append(ref, field, item)
  local list = ref[field]
  if list then
    list[#list + 1] = item
  else
    ref[field] = { item }
  end
end

--- Turn the raw {tag, value} pairs of one RIS record into a CSL-style table.
-- Unknown tags are ignored. Tags are processed in file order, which matters
-- for BT (its meaning depends on a preceding TY) and for SP/EP.
local function clean(refpairs)
  local ref = {}
  for i = 1, #refpairs do
    local k, v = table.unpack(refpairs[i])
    if k == "TY" then
      ref["type"] = types[v]
    elseif k == "VL" then
      ref.volume = v
    elseif k == "KW" then
      ref.keyword = v
    elseif k == "PB" then
      ref.publisher = v
    elseif k == "CY" or k == "PP" then
      ref["publisher-place"] = v
    elseif k == "SP" then
      -- EP may already have stored "-<end>"; prepend the start page then.
      if ref.page then
        ref.page = v .. ref.page
      else
        ref.page = v
      end
    elseif k == "EP" then
      if ref.page then
        ref.page = ref.page .. "-" .. v
      else
        ref.page = "-" .. v
      end
    elseif k == "AU" or k == "A1" or k == "A2" or k == "A3" then
      append(ref, "author", inlines(v))
    elseif k == "TI" or k == "T1" or k == "CT" or
           (k == "BT" and ref.type == "book") then
      ref.title = inlines(v)
    elseif k == "ET" then
      ref.edition = inlines(v)
    elseif k == "NV" then
      ref["number-of-volumes"] = inlines(v)
    elseif k == "AB" then
      ref.abstract = inlines(v)
    elseif k == "ED" then
      append(ref, "editor", inlines(v))
    elseif k == "JO" or k == "JF" or k == "T2" or
           (k == "BT" and ref.type ~= "book") then
      ref["container-title"] = inlines(v)
    elseif k == "PY" or k == "Y1" then
      ref.issued = v
    elseif k == "IS" then
      ref.issue = v
    elseif k == "SN" then
      ref.ISSN = v
    elseif k == "LA" then
      -- RIS tags are always two characters, so the language tag is "LA";
      -- the value goes into the CSL `language` variable.
      ref.language = v
    elseif k == "UR" or k == "LK" then
      ref.URL = v
    end
  end
  return ref
end

--- Build a citation key from the authors' first alphabetic runs plus the
-- first run of digits of the issue date, e.g. "Shannon1948".
-- Falls back to "ref<index>" when neither yields anything.
local function default_id(ref, index)
  local parts = {}
  for _, author in ipairs(ref.author or {}) do
    -- assigning nil (no alphabetic run found) leaves `parts` unchanged
    parts[#parts + 1] = string.match(pandoc.utils.stringify(author), "%a+")
  end
  if ref.issued then
    parts[#parts + 1] = string.match(ref.issued, "%d+")
  end
  local id = table.concat(parts)
  if id == "" then
    id = "ref" .. index
  end
  return id
end

--- Return `id`, or `id-2`, `id-3`, ... -- the first variant not in `seen`.
local function unique_id(id, seen)
  local candidate, n = id, 1
  while seen[candidate] do
    n = n + 1
    candidate = id .. "-" .. n
  end
  return candidate
end

--- Entry point called by pandoc.
-- @param input the RIS text; a string or a value convertible with `tostring`
-- @param reader_options reader options table (unused)
-- @return a Pandoc document with an empty body and the parsed entries in
--   the `references` metadata field
function Reader(input, reader_options)
  local refs = {}
  local seen_ids = {}   -- set of ids already assigned
  local thisref = {}    -- {tag, value} pairs of the record being read

  local function finish_record()
    if #thisref == 0 then
      return              -- stray ER line without any fields
    end
    local newref = clean(thisref)
    -- ensure we have an id and if not, create a sensible one
    if not newref.id then
      newref.id = unique_id(default_id(newref, #refs + 1), seen_ids)
    end
    seen_ids[newref.id] = true
    refs[#refs + 1] = newref
    thisref = {}
  end

  -- "[^\r\n]+" skips empty lines and keeps a CR of CRLF input out of values.
  for line in string.gmatch(tostring(input), "[^\r\n]+") do
    -- Accept one or more spaces before the hyphen, and tolerate a missing
    -- space after it (editors often strip the trailing blank of "ER - ").
    local key, val = string.match(line, "([A-Z][A-Z0-9]) +%- ?(.*)")
    if key == "ER" then
      finish_record()
    elseif key then
      thisref[#thisref + 1] = { key, val }
    end
  end
  -- keep a final record that is not terminated by an ER line
  finish_record()

  return pandoc.Pandoc({}, pandoc.Meta { references = refs })
end
-- A sample custom reader for Creole 1.0 (common wiki markup)
-- http://www.wikicreole.org/wiki/CheatSheet

-- For better performance we put the lpeg functions we use in local variables:
local P, S, Cc, Ct, V, C =
  lpeg.P, lpeg.S, lpeg.Cc, lpeg.Ct, lpeg.V, lpeg.C

local whitespacechar = S(" \t\r\n")
local specialchar = S("/*~[]\\{}|")
local wordchar = (1 - (whitespacechar + specialchar))
local spacechar = S(" \t")
local newline = P"\r"^-1 * P"\n"
local blankline = spacechar^0 * newline
-- a line end that is not followed by a blank line
local endline = newline * #-blankline
-- optional closing "=" signs of a heading, up to and including the line end
local endequals = spacechar^0 * P"="^0 * spacechar^0 * newline
local cellsep = spacechar^0 * P"|"

--- Strip leading and trailing whitespace from `s`.
local function trim(s)
  return (s:gsub("^%s*(.-)%s*$", "%1"))
end

--- Build the pattern for one list item at nesting level `lev`.
-- @param lev nesting depth; the item marker must occur at least `lev` times
-- @param ch marker character ("*" or "#"); nil accepts either
-- @return an lpeg pattern producing `{ Plain(inlines), sublist-or-nil }`
local function ListItem(lev, ch)
  local start
  if ch == nil then
    start = S"*#"
  else
    start = P(ch)
  end
  local subitem = function(c)
    if lev < 6 then
      return ListItem(lev + 1, c)
    else
      -- Nesting stops at level 6. This must be a pattern that never
      -- matches: the arithmetic expression `1 - 1` is the number 0, which
      -- lpeg converts to a pattern that always succeeds.
      return P(false)
    end
  end
  local parser = spacechar^0
               * start^lev
               * #(- start)
               * spacechar^0
               * Ct((V"Inline" - (newline * spacechar^0 * S"*#"))^0)
               * newline
               * (Ct(subitem("*")^1) / pandoc.BulletList
                  +
                  Ct(subitem("#")^1) / pandoc.OrderedList
                  +
                  Cc(nil))
               / function (ils, sublist)
                   return { pandoc.Plain(ils), sublist }
                 end
  return parser
end

-- Grammar. Kept local so the script exports nothing but `Reader`.
local G = P{ "Doc",
  Doc = Ct(V"Block"^0)
      / pandoc.Pandoc ;
  Block = blankline^0
        * ( V"Header"
          + V"HorizontalRule"
          + V"CodeBlock"
          + V"List"
          + V"Table"
          + V"Para") ;
  Para = Ct(V"Inline"^1)
       * newline
       / pandoc.Para ;
  HorizontalRule = spacechar^0
                 * P"----"
                 * spacechar^0
                 * newline
                 / pandoc.HorizontalRule;
  -- the number of leading "=" signs is the heading level
  Header = (P("=")^1 / string.len)
         * spacechar^1
         * Ct((V"Inline" - endequals)^1)
         * endequals
         / pandoc.Header;
  CodeBlock = P"{{{"
            * blankline
            * C((1 - (newline * P"}}}"))^0)
            * newline
            * P"}}}"
            / pandoc.CodeBlock;
  -- NOTE(review): Str precedes Placeholder in Inline and "<" is a word
  -- character, so this rule looks unreachable; it also returns a
  -- block-level Div from an inline rule. Confirm the intent before
  -- relying on it.
  Placeholder = P"<<<"
              * C(P(1) - P">>>")^0
              * P">>>"
              / function() return pandoc.Div({}) end;
  List = V"BulletList"
       + V"OrderedList" ;
  BulletList = Ct(ListItem(1,'*')^1)
             / pandoc.BulletList ;
  OrderedList = Ct(ListItem(1,'#')^1)
             / pandoc.OrderedList ;
  Table = (V"TableHeader" + Cc{})
        * Ct(V"TableRow"^1)
        / function(headrow, bodyrows)
            -- the column count is taken from the first body row
            local numcolumns = #(bodyrows[1])
            local aligns = {}
            local widths = {}
            for i = 1,numcolumns do
              aligns[i] = pandoc.AlignDefault
              widths[i] = 0
            end
            return pandoc.utils.from_simple_table(
              pandoc.SimpleTable({}, aligns, widths, headrow, bodyrows))
          end ;
  TableHeader = Ct(V"HeaderCell"^1)
              * cellsep^-1
              * spacechar^0
              * newline ;
  TableRow   = Ct(V"BodyCell"^1)
             * cellsep^-1
             * spacechar^0
             * newline ;
  HeaderCell = cellsep
             * P"="
             * spacechar^0
             * Ct((V"Inline" - (newline + cellsep))^0)
             / function(ils) return { pandoc.Plain(ils) } end ;
  BodyCell   = cellsep
             * spacechar^0
             * Ct((V"Inline" - (newline + cellsep))^0)
             / function(ils) return { pandoc.Plain(ils) } end ;
  Inline = V"Emph"
         + V"Strong"
         + V"LineBreak"
         + V"Link"
         + V"URL"
         + V"Image"
         + V"Str"
         + V"Space"
         + V"SoftBreak"
         + V"Escaped"
         + V"Placeholder"
         + V"Code"
         + V"Special" ;
  Str = wordchar^1
      / pandoc.Str;
  Escaped = P"~"
          * C(P(1))
          / pandoc.Str ;
  Special = specialchar
          / pandoc.Str;
  Space = spacechar^1
        / pandoc.Space ;
  SoftBreak = endline
            * # -(V"HorizontalRule" + V"CodeBlock")
            / pandoc.SoftBreak ;
  LineBreak = P"\\\\"
            / pandoc.LineBreak ;
  Code = P"{{{"
       * C((1 - P"}}}")^0)
       * P"}}}"
       / trim / pandoc.Code ;
  Link = P"[["
       * C((1 - (P"]]" + P"|"))^0)
       * (P"|" * Ct((V"Inline" - P"]]")^1))^-1 * P"]]"
       / function(url, desc)
           local txt = desc or {pandoc.Str(url)}
           return pandoc.Link(txt, url)
         end ;
  Image = P"{{"
        * #-P"{"
        -- The URL ends at "|" as well as at "}"; otherwise the capture
        -- swallows "|alt text" and the description below can never match.
        * C((1 - S"}|")^0)
        * (P"|" * Ct((V"Inline" - P"}}")^1))^-1
        * P"}}"
        / function(url, desc)
            local txt = desc or ""
            return pandoc.Image(txt, url)
          end ;
  URL = P"http"
      * P"s"^-1
      * P":"
      * (1 - (whitespacechar + (S",.?!:;\"'" * #whitespacechar)))^1
      / function(url)
          return pandoc.Link(pandoc.Str(url), url)
        end ;
  Emph = P"//"
       * Ct((V"Inline" - P"//")^1)
       * P"//"
       / pandoc.Emph ;
  Strong = P"**"
         * Ct((V"Inline" -P"**")^1)
         * P"**"
         / pandoc.Strong ;
}

--- Entry point called by pandoc.
-- @param input the Creole text; a string or a value convertible with
--   `tostring`
-- @param reader_options reader options table (unused)
-- @return the Pandoc document produced by the grammar
function Reader(input, reader_options)
  -- Para, list items and table rows must end in a newline, but a newline
  -- directly before the end of input is consumed as a SoftBreak, and lpeg
  -- repetitions never backtrack. Without a trailing blank line the last
  -- block would be dropped, so one is always appended; surplus blank
  -- lines are ignored by the grammar.
  return lpeg.match(G, tostring(input) .. "\n\n")
end
-- consumes the output of https://www.reddit.com/r/haskell.json

local json = require'json'  -- luajson must be available

--- Parse a CommonMark string into a list of pandoc blocks.
-- A missing (nil) field is treated as empty text.
local function read_blocks(raw)
  return pandoc.read(raw or "", "commonmark").blocks
end

--- Parse a CommonMark string and flatten the result to a list of inlines.
local function read_inlines(raw)
  return pandoc.utils.blocks_to_inlines(read_blocks(raw))
end

--- Entry point called by pandoc.
-- @param input the JSON text; a string or a value convertible with
--   `tostring`
-- @return a Pandoc document with one level-2 header (linking to the
--   article) per listing entry, followed by the entry's self text
function Reader(input)

  local parsed = json.decode(tostring(input))
  local listing = type(parsed) == "table" and parsed.data
  -- fail with a readable message instead of "attempt to index a nil value"
  assert(type(listing) == "table" and type(listing.children) == "table",
         "unexpected JSON: expected an object with a data.children list")

  local blocks = {}
  for _, entry in ipairs(listing.children) do
    local d = entry.data
    blocks[#blocks + 1] =
      pandoc.Header(2, pandoc.Link(read_inlines(d.title), d.url))
    for _, block in ipairs(read_blocks(d.selftext)) do
      blocks[#blocks + 1] = block
    end
  end

  return pandoc.Pandoc(blocks)

end
--- doc/custom-readers.md | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'doc/custom-readers.md') diff --git a/doc/custom-readers.md b/doc/custom-readers.md index fe032d4b3..c85b43ae4 100644 --- a/doc/custom-readers.md +++ b/doc/custom-readers.md @@ -208,19 +208,6 @@ local types = VIDEO = "graphic" } -local function inlines(s) - local ils = {} - for t in string.gmatch(s, "%S+") do - if #ils == 0 then - ils = {pandoc.Str(t)} - else - table.insert(ils, pandoc.Space()) - table.insert(ils, pandoc.Str(t)) - end - end - return pandoc.MetaInlines(ils) -end - local function clean(refpairs) local ref = {} for i = 1, #refpairs do @@ -249,28 +236,28 @@ local function clean(refpairs) end elseif k == "AU" or k == "A1" or k == "A2" or k == "A3" then if ref.author then - table.insert(ref.author, inlines(v)) + table.insert(ref.author, v) else - ref.author = {inlines(v)} + ref.author = {v} end elseif k == "TI" or k == "T1" or k == "CT" or (k == "BT" and ref.type == "book") then - ref.title = inlines(v) + ref.title = v elseif k == "ET" then - ref.edition = inlines(v) + ref.edition = v elseif k == "NV" then - ref["number-of-volumes"] = inlines(v) + ref["number-of-volumes"] = v elseif k == "AB" then - ref.abstract = inlines(v) + ref.abstract = v elseif k == "ED" then if ref.editor then - table.insert(ref.editor, inlines(v)) + table.insert(ref.editor, v) else - ref.editor = {inlines(v)} + ref.editor = {v} end elseif k == "JO" or k == "JF" or k == "T2" or (k == "BT" and ref.type ~= "book") then - ref["container-title"] = inlines(v) + ref["container-title"] = v elseif k == "PY" or k == "Y1" then ref.issued = v elseif k == "IS" then -- cgit v1.2.3 From 136739b7ed6dbd4052be582febf4ff3beea32d87 Mon Sep 17 00:00:00 2001 From: Mauro Bieg Date: Mon, 29 Nov 2021 17:59:37 +0100 Subject: fix typo in custom-readers.md (#7722) --- doc/custom-readers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'doc/custom-readers.md') diff 
--git a/doc/custom-readers.md b/doc/custom-readers.md index c85b43ae4..afa0caa73 100644 --- a/doc/custom-readers.md +++ b/doc/custom-readers.md @@ -14,7 +14,7 @@ install any additional software to do this. [Lua]: https://www.lua.org -A custom writer is a Lua file that defines a function +A custom reader is a Lua file that defines a function called `Reader`, which takes two arguments: - a string, the raw input to be parsed -- cgit v1.2.3 From 83b5b79c0e4f073198b5af11b9e8a0a4471fcd41 Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Wed, 8 Dec 2021 19:06:48 +0100 Subject: Custom reader: pass list of sources instead of concatenated text The first argument passed to Lua `Reader` functions is no longer a plain string but a richer data structure. The structure can easily be converted to a string by applying `tostring`, but is also a list with elements that contain each the *text* and *name* of each input source as a property of the respective name. A small example is added to the custom reader documentation, showcasing its use in a reader that creates a syntax-highlighted code block for each source code file passed as input. Existing readers must be updated. 
--- data/creole.lua | 2 +- doc/custom-readers.md | 55 +++++++++++++++++++++++++++++----- pandoc.cabal | 1 + src/Text/Pandoc/Lua/Marshal/Sources.hs | 46 ++++++++++++++++++++++++++++ src/Text/Pandoc/Lua/Orphans.hs | 5 ++++ src/Text/Pandoc/Readers/Custom.hs | 10 +++---- 6 files changed, 104 insertions(+), 15 deletions(-) create mode 100644 src/Text/Pandoc/Lua/Marshal/Sources.hs (limited to 'doc/custom-readers.md') diff --git a/data/creole.lua b/data/creole.lua index 5b7d7f554..590dfc871 100644 --- a/data/creole.lua +++ b/data/creole.lua @@ -186,5 +186,5 @@ G = P{ "Doc", } function Reader(input, reader_options) - return lpeg.match(G, input) + return lpeg.match(G, tostring(input)) end diff --git a/doc/custom-readers.md b/doc/custom-readers.md index afa0caa73..df2de2182 100644 --- a/doc/custom-readers.md +++ b/doc/custom-readers.md @@ -17,7 +17,7 @@ install any additional software to do this. A custom reader is a Lua file that defines a function called `Reader`, which takes two arguments: -- a string, the raw input to be parsed +- the raw input to be parsed, as a list of sources - optionally, a table of reader options, e.g. `{ columns = 62, standalone = true }`. @@ -27,6 +27,16 @@ which is automatically in scope. (Indeed, all of the utility functions that are available for [Lua filters] are available in custom readers, too.) +Each source item corresponds to a file or stream passed to pandoc +containing its text and name. E.g., if a single file `input.txt` +is passed to pandoc, then the list of sources will contain just a +single element `s`, where `s.name == 'input.txt'` and `s.text` +contains the file contents as a string. + +The sources list, as well as each of its elements, can be +converted to a string via the Lua standard library function +`tostring`. 
+ [Lua filters]: https://pandoc.org/lua-filters.html [`pandoc` module]: https://pandoc.org/lua-filters.html#module-pandoc @@ -34,12 +44,20 @@ A minimal example would be ```lua function Reader(input) - return pandoc.Pandoc({ pandoc.CodeBlock(input) }) + return pandoc.Pandoc({ pandoc.CodeBlock(tostring(input)) }) end ``` -This just returns a document containing a big code block with -all of the input. +This just returns a document containing a big code block with all +of the input. Or, to create a separate code block for each input +file, one might write + +``` lua +function Reader(input) + return pandoc.Pandoc(input:map( + function (s) return pandoc.CodeBlock(s.text) end)) +end +``` In a nontrivial reader, you'll want to parse the input. You can do this using standard Lua library functions @@ -84,7 +102,7 @@ G = P{ "Pandoc", } function Reader(input) - return lpeg.match(G, input) + return lpeg.match(G, tostring(input)) end ``` @@ -277,7 +295,7 @@ function Reader(input, reader_options) local refs = {} local thisref = {} local ids = {} - for line in string.gmatch(input, "[^\n]*") do + for line in string.gmatch(tostring(input), "[^\n]*") do key, val = string.match(line, "([A-Z][A-Z0-9]) %- (.*)") if key == "ER" then -- clean up fields @@ -550,7 +568,7 @@ G = P{ "Doc", } function Reader(input, reader_options) - return lpeg.match(G, input) + return lpeg.match(G, tostring(input)) end ``` @@ -614,7 +632,7 @@ end function Reader(input) - local parsed = json.decode(input) + local parsed = json.decode(tostring(input)) local blocks = {} for _,entry in ipairs(parsed.data.children) do @@ -636,3 +654,24 @@ Similar code can be used to consume JSON output from other APIs. Note that the content of the text fields is markdown, so we convert it using `pandoc.read()`. 
--- Render one input source as a titled, syntax-highlighted code block.
-- @param source an input source with string fields `name` and `text`
-- @return a Div holding a level-1 Header (the source name) and a CodeBlock
--   whose class is the file extension
local function to_code_block (source)
  local _, ext = pandoc.path.split_extension(source.name)
  -- split_extension keeps the leading dot (".lua"); a highlighting class
  -- must be the bare language name ("lua").
  local lang = (ext:gsub('^%.', ''))
  -- input read from standard input has an empty name; give it a label
  local title = source.name == '' and '<stdin>' or source.name
  return pandoc.Div{
    pandoc.Header(1, title),
    pandoc.CodeBlock(source.text, {class=lang}),
  }
end

--- Entry point called by pandoc.
-- @param input list of sources (supports `:map`)
-- @param opts reader options table (unused)
-- @return a Pandoc document with one Div per input source
function Reader (input, opts)
  return pandoc.Pandoc(input:map(to_code_block))
end
-- | Pushes the 'Sources' as a Lua list of source userdata objects. The
-- list gets the \"pandoc Sources\" list metatable, extended with a
-- @__tostring@ metamethod that concatenates the text of all sources.
pushSources :: LuaError e => Pusher e Sources
pushSources (Sources srcs) = do
  pushList (pushUD typeSource) srcs
  newListMetatable "pandoc Sources" $ do
    -- stack here: sources list, metatable; add metatable.__tostring
    pushName "__tostring"
    pushHaskellFunction $ do
      -- the first (bottom) argument is the sources list itself
      sources <- forcePeek $ peekList (peekUD typeSource) (nthBottom 1)
      pushText . mconcat $ map snd sources
      return 1
    rawset (nth 3)
  setmetatable (nth 2)

-- | Source object type: a single input source, i.e. its position (which
-- carries the source name) paired with its text. Exposes the read-only
-- fields @name@ and @text@; @tostring@ yields the text.
typeSource :: LuaError e => DocumentedType e (SourcePos, Text)
typeSource = deftype "pandoc input source"
  [ operation Tostring $ lambda
    ### liftPure snd
    <#> udparam typeSource "source" "input source whose text is returned"
    =#> functionResult pushText "string" "text of the source"
  ]
  [ readonly "name" "source name"
      (pushString, sourceName . fst)
  , readonly "text" "source text"
      (pushText, snd)
  ]
module Text.Pandoc.Readers.Custom ( readCustom ) where import Control.Exception import Control.Monad (when) -import Data.Text (Text) import HsLua as Lua hiding (Operation (Div), render) import HsLua.Class.Peekable (PeekError) import Control.Monad.IO.Class (MonadIO) @@ -26,13 +25,13 @@ import Text.Pandoc.Lua (Global (..), runLua, setGlobals) import Text.Pandoc.Lua.Util (dofileWithTraceback) import Text.Pandoc.Options import Text.Pandoc.Class (PandocMonad) -import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import Text.Pandoc.Sources (Sources, ToSources(..)) -- | Convert custom markup to Pandoc. readCustom :: (PandocMonad m, MonadIO m, ToSources s) => FilePath -> ReaderOptions -> s -> m Pandoc -readCustom luaFile opts sources = do - let input = sourcesToText $ toSources sources +readCustom luaFile opts srcs = do + let input = toSources srcs let globals = [ PANDOC_SCRIPT_FILE luaFile ] res <- runLua $ do setGlobals globals @@ -47,8 +46,7 @@ readCustom luaFile opts sources = do Right doc -> return doc parseCustom :: forall e. PeekError e - => Text + => Sources -> ReaderOptions -> LuaE e Pandoc parseCustom = invoke @e "Reader" - -- cgit v1.2.3 From e88224621de1a8f1be4ea7ad9bf05fe635ddc3cc Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Thu, 9 Dec 2021 08:36:17 +0100 Subject: Custom reader: ensure old Readers continue to work Retry conversion by passing a string instead of sources when the `Reader` fails with a message that hints at an outdated function. A deprecation notice is reported in that case. 
--- doc/custom-readers.md | 7 +++++ src/Text/Pandoc/Lua/Util.hs | 1 + src/Text/Pandoc/Readers/Custom.hs | 63 +++++++++++++++++++++++++++++---------- 3 files changed, 55 insertions(+), 16 deletions(-) (limited to 'doc/custom-readers.md') diff --git a/doc/custom-readers.md b/doc/custom-readers.md index df2de2182..37b6d6a3e 100644 --- a/doc/custom-readers.md +++ b/doc/custom-readers.md @@ -66,6 +66,13 @@ and fast [lpeg] parsing library, which is automatically in scope. You can also use external Lua libraries (for example, an XML parser). +A previous pandoc version passed a raw string instead of a list +of sources to the Reader function. Reader functions that rely on +this are obsolete, but still supported: Pandoc analyzes any +script error, detecting when code assumed the old behavior. The +code is rerun with raw string input in this case, thereby +ensuring backwards compatibility. + [patterns]: http://lua-users.org/wiki/PatternsTutorial [lpeg]: http://www.inf.puc-rio.br/~roberto/lpeg/ diff --git a/src/Text/Pandoc/Lua/Util.hs b/src/Text/Pandoc/Lua/Util.hs index 6d67d340d..9c6f42b2b 100644 --- a/src/Text/Pandoc/Lua/Util.hs +++ b/src/Text/Pandoc/Lua/Util.hs @@ -13,6 +13,7 @@ Lua utility functions. 
module Text.Pandoc.Lua.Util ( addField , callWithTraceback + , pcallWithTraceback , dofileWithTraceback ) where diff --git a/src/Text/Pandoc/Readers/Custom.hs b/src/Text/Pandoc/Readers/Custom.hs index 7b6c99ed8..9252a9e45 100644 --- a/src/Text/Pandoc/Readers/Custom.hs +++ b/src/Text/Pandoc/Readers/Custom.hs @@ -1,8 +1,5 @@ -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeApplications #-} {- | Module : Text.Pandoc.Readers.Custom Copyright : Copyright (C) 2021 John MacFarlane @@ -18,20 +15,23 @@ module Text.Pandoc.Readers.Custom ( readCustom ) where import Control.Exception import Control.Monad (when) import HsLua as Lua hiding (Operation (Div), render) -import HsLua.Class.Peekable (PeekError) import Control.Monad.IO.Class (MonadIO) import Text.Pandoc.Definition +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Logging import Text.Pandoc.Lua (Global (..), runLua, setGlobals) -import Text.Pandoc.Lua.Util (dofileWithTraceback) +import Text.Pandoc.Lua.PandocLua +import Text.Pandoc.Lua.Marshal.Pandoc (peekPandoc) +import Text.Pandoc.Lua.Util (dofileWithTraceback, callWithTraceback, + pcallWithTraceback) import Text.Pandoc.Options -import Text.Pandoc.Class (PandocMonad) -import Text.Pandoc.Sources (Sources, ToSources(..)) +import Text.Pandoc.Sources (ToSources(..), sourcesToText) +import qualified Data.Text as T -- | Convert custom markup to Pandoc. 
readCustom :: (PandocMonad m, MonadIO m, ToSources s) => FilePath -> ReaderOptions -> s -> m Pandoc readCustom luaFile opts srcs = do - let input = toSources srcs let globals = [ PANDOC_SCRIPT_FILE luaFile ] res <- runLua $ do setGlobals globals @@ -40,13 +40,44 @@ readCustom luaFile opts srcs = do -- to handle this more gracefully): when (stat /= Lua.OK) Lua.throwErrorAsException - parseCustom input opts + parseCustom case res of Left msg -> throw msg Right doc -> return doc - -parseCustom :: forall e. PeekError e - => Sources - -> ReaderOptions - -> LuaE e Pandoc -parseCustom = invoke @e "Reader" + where + parseCustom = do + let input = toSources srcs + getglobal "Reader" + push input + push opts + pcallWithTraceback 2 1 >>= \case + OK -> forcePeek $ peekPandoc top + ErrRun -> do + -- Caught a runtime error. Check if parsing might work if we + -- pass a string instead of a Sources list, then retry. + runPeek (peekText top) >>= \case + Failure {} -> + -- not a string error object. Bail! + throwErrorAsException + Success errmsg -> do + if "string expected, got pandoc Sources" `T.isInfixOf` errmsg + then do + pop 1 + _ <- unPandocLua $ do + report $ Deprecated "old Reader function signature" $ + T.unlines + [ "Reader functions should accept a sources list; " + , "functions expecting `string` input are deprecated. " + , "Use `tostring` to convert the first argument to a " + , "string." + ] + getglobal "Reader" + push $ sourcesToText input -- push sources as string + push opts + callWithTraceback 2 1 + forcePeek $ peekPandoc top + else + -- nothing we can do here + throwErrorAsException + _ -> -- not a runtime error, we won't be able to recover from that + throwErrorAsException -- cgit v1.2.3