diff options
author | John MacFarlane <jgm@berkeley.edu> | 2016-01-21 15:23:50 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2016-01-21 15:23:50 -0800 |
commit | 3b39b16a4b6d2d8078be27d0a1b3fc977c55731f (patch) | |
tree | 0f7fece800c351f4e2aa696224804769e23de26f | |
parent | 7ea4d17509c3b0e987793d389987b87638fc9d55 (diff) | |
parent | 4d74a966c4dae72f627696e6440f8ac530965d6c (diff) | |
download | pandoc-3b39b16a4b6d2d8078be27d0a1b3fc977c55731f.tar.gz |
Merge pull request #2638 from c-forster/teiwriter
Add TEI Writer.
-rw-r--r-- | README | 15 | ||||
-rw-r--r-- | pandoc.cabal | 2 | ||||
-rw-r--r-- | pandoc.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/TEI.hs | 320 | ||||
-rw-r--r-- | tests/Tests/Writers/TEI.hs | 43 | ||||
-rw-r--r-- | tests/test-pandoc.hs | 2 |
7 files changed, 380 insertions, 7 deletions
@@ -22,7 +22,7 @@ markup], [Haddock markup], [OPML], [Emacs Org mode], [DocBook], [OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup], [DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3), [FictionBook2], [Textile], [groff man] pages, [Emacs Org mode], -[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides], +[AsciiDoc], [InDesign ICML], [TEI XML], and [Slidy], [Slideous], [DZSlides], [reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output on systems where LaTeX, ConTeXt, or `wkhtmltopdf` is installed. @@ -89,6 +89,7 @@ Markdown can be expected to be lossy. [reveal.js]: http://lab.hakim.se/reveal-js/ [FictionBook2]: http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1 [InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf +[TEI Simple]: https://github.com/TEIC/TEI-Simple Using `pandoc` -------------- @@ -277,11 +278,11 @@ General options `docx` (Word docx), `haddock` (Haddock markup), `rtf` (rich text format), `epub` (EPUB v2 book), `epub3` (EPUB v3), `fb2` (FictionBook2 e-book), `asciidoc` (AsciiDoc), `icml` (InDesign - ICML), `slidy` (Slidy HTML and javascript slide show), `slideous` - (Slideous HTML and javascript slide show), `dzslides` (DZSlides - HTML5 + javascript slide show), `revealjs` (reveal.js HTML5 + - javascript slide show), `s5` (S5 HTML and javascript slide show), - or the path of a custom lua writer (see [Custom + ICML), `tei` (TEI Simple), `slidy` (Slidy HTML and javascript slide + show), `slideous` (Slideous HTML and javascript slide show), + `dzslides` (DZSlides HTML5 + javascript slide show), `revealjs` + (reveal.js HTML5 + javascript slide show), `s5` (S5 HTML and javascript + slide show), or the path of a custom lua writer (see [Custom writers], below). Note that `odt`, `epub`, and `epub3` output will not be directed to *stdout*; an output filename must be specified using the `-o/--output` option. If @@ -471,7 +472,7 @@ General writer options `-s`, `--standalone` : Produce output with an appropriate header and footer (e.g. a - standalone HTML, LaTeX, or RTF file, not a fragment). This option + standalone HTML, LaTeX, TEI, or RTF file, not a fragment). This option is set automatically for `pdf`, `epub`, `epub3`, `fb2`, `docx`, and `odt` output. diff --git a/pandoc.cabal b/pandoc.cabal index 4d2be8e6c..d2fe01eb9 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -362,6 +362,7 @@ Library Text.Pandoc.Writers.Docx, Text.Pandoc.Writers.EPUB, Text.Pandoc.Writers.FB2, + Text.Pandoc.Writers.TEI, Text.Pandoc.PDF, Text.Pandoc.UTF8, Text.Pandoc.Templates, @@ -504,6 +505,7 @@ Test-Suite test-pandoc Tests.Writers.LaTeX Tests.Writers.Docx Tests.Writers.RST + Tests.Writers.TEI Ghc-Options: -rtsopts -Wall -fno-warn-unused-do-bind -threaded Default-Language: Haskell98 @@ -1009,6 +1009,8 @@ defaultWriterName x = ".fb2" -> "fb2" ".opml" -> "opml" ".icml" -> "icml" + ".tei.xml" -> "tei" + ".tei" -> "tei" ['.',y] | y `elem` ['1'..'9'] -> "man" _ -> "html" diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 3f46648a2..4b2397eb9 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -115,6 +115,7 @@ module Text.Pandoc , writeHaddock , writeCommonMark , writeCustom + , writeTEI -- * Rendering templates and default templates , module Text.Pandoc.Templates -- * Miscellaneous @@ -169,6 +170,7 @@ import Text.Pandoc.Writers.AsciiDoc import Text.Pandoc.Writers.Haddock import Text.Pandoc.Writers.CommonMark import Text.Pandoc.Writers.Custom +import Text.Pandoc.Writers.TEI import Text.Pandoc.Templates import Text.Pandoc.Options import Text.Pandoc.Shared (safeRead, warn, mapLeft, pandocVersion) @@ -304,6 +306,7 @@ writers = [ ,("asciidoc" , PureStringWriter writeAsciiDoc) ,("haddock" , PureStringWriter writeHaddock) ,("commonmark" , PureStringWriter writeCommonMark) + ,("tei" , PureStringWriter writeTEI) ] getDefaultExtensions :: String -> Set Extension diff --git a/src/Text/Pandoc/Writers/TEI.hs b/src/Text/Pandoc/Writers/TEI.hs new file mode 100644 index 000000000..b9e683ab9 --- /dev/null +++ b/src/Text/Pandoc/Writers/TEI.hs @@ -0,0 +1,320 @@ +{-# LANGUAGE OverloadedStrings, PatternGuards #-} +{- +Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Writers.Docbook + Copyright : Copyright (C) 2006-2015 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Conversion of 'Pandoc' documents to Docbook XML. +-} +module Text.Pandoc.Writers.TEI (writeTEI) where +import Text.Pandoc.Definition +import Text.Pandoc.XML +import Text.Pandoc.Shared +import Text.Pandoc.Writers.Shared +import Text.Pandoc.Options +import Text.Pandoc.Templates (renderTemplate') +import Data.List ( stripPrefix, isPrefixOf, isSuffixOf ) +import Data.Char ( toLower ) +import Text.Pandoc.Highlighting ( languages, languagesByExtension ) +import Text.Pandoc.Pretty +import Text.Pandoc.ImageSize +import qualified Text.Pandoc.Builder as B + +-- | Convert list of authors to a docbook <author> section +authorToTEI :: WriterOptions -> [Inline] -> B.Inlines +authorToTEI opts name' = + let name = render Nothing $ inlinesToTEI opts name' + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + in B.rawInline "tei" $ render colwidth $ + inTagsSimple "author" (text $ escapeStringForXML name) + +-- | Convert Pandoc document to string in Docbook format. +writeTEI :: WriterOptions -> Pandoc -> String +writeTEI opts (Pandoc meta blocks) = + let elements = hierarchicalize blocks + colwidth = if writerWrapText opts == WrapAuto + then Just $ writerColumns opts + else Nothing + render' = render colwidth + opts' = if "/book>" `isSuffixOf` + (trimr $ writerTemplate opts) + then opts{ writerChapters = True } + else opts + startLvl = if writerChapters opts' then 0 else 1 + auths' = map (authorToTEI opts) $ docAuthors meta + meta' = B.setMeta "author" auths' meta + Just metadata = metaToJSON opts + (Just . render colwidth . (vcat . + (map (elementToTEI opts' startLvl)) . hierarchicalize)) + (Just . render colwidth . inlinesToTEI opts') + meta' + main = render' $ vcat (map (elementToTEI opts' startLvl) elements) + context = defField "body" main + $ defField "mathml" (case writerHTMLMathMethod opts of + MathML _ -> True + _ -> False) + $ metadata + in if writerStandalone opts + then renderTemplate' (writerTemplate opts) context + else main + +-- | Convert an Element to TEI. +elementToTEI :: WriterOptions -> Int -> Element -> Doc +elementToTEI opts _ (Blk block) = blockToTEI opts block +elementToTEI opts lvl (Sec _ _num (id',_,_) title elements) = + -- TEI doesn't allow sections with no content, so insert some if needed + let elements' = if null elements + then [Blk (Para [])] + else elements + divType = case lvl of + n | n == 0 -> "chapter" + | n >= 1 && n <= 5 -> "level" ++ show n + | otherwise -> "section" + in inTags True "div" [("type", divType) | not (null id')] $ +-- ("id", writerIdentifierPrefix opts ++ id') | not (null id')] $ + inTagsSimple "head" (inlinesToTEI opts title) $$ + vcat (map (elementToTEI opts (lvl + 1)) elements') + +-- | Convert a list of Pandoc blocks to TEI. +blocksToTEI :: WriterOptions -> [Block] -> Doc +blocksToTEI opts = vcat . map (blockToTEI opts) + +-- | Auxiliary function to convert Plain block to Para. +plainToPara :: Block -> Block +plainToPara (Plain x) = Para x +plainToPara x = x + +-- | Convert a list of pairs of terms and definitions into a TEI +-- list with labels and items. +deflistItemsToTEI :: WriterOptions -> [([Inline],[[Block]])] -> Doc +deflistItemsToTEI opts items = + vcat $ map (\(term, defs) -> deflistItemToTEI opts term defs) items + +-- | Convert a term and a list of blocks into a TEI varlistentry. +deflistItemToTEI :: WriterOptions -> [Inline] -> [[Block]] -> Doc +deflistItemToTEI opts term defs = + let def' = concatMap (map plainToPara) defs + in inTagsIndented "label" (inlinesToTEI opts term) $$ + inTagsIndented "item" (blocksToTEI opts def') + +-- | Convert a list of lists of blocks to a list of TEI list items. +listItemsToTEI :: WriterOptions -> [[Block]] -> Doc +listItemsToTEI opts items = vcat $ map (listItemToTEI opts) items + +-- | Convert a list of blocks into a TEI list item. +listItemToTEI :: WriterOptions -> [Block] -> Doc +listItemToTEI opts item = + inTagsIndented "item" $ blocksToTEI opts $ map plainToPara item + +imageToTEI :: WriterOptions -> Attr -> String -> Doc +imageToTEI _ attr src = selfClosingTag "graphic" $ + ("url", src) : idAndRole attr ++ dims + where + dims = go Width "width" ++ go Height "depth" + go dir dstr = case (dimension dir attr) of + Just a -> [(dstr, show a)] + Nothing -> [] + +-- | Convert a Pandoc block element to TEI. +blockToTEI :: WriterOptions -> Block -> Doc +blockToTEI _ Null = empty +-- Add ids to paragraphs in divs with ids - this is needed for +-- pandoc-citeproc to get link anchors in bibliographies: +blockToTEI opts (Div (ident,_,_) [Para lst]) = + let attribs = [("id", ident) | not (null ident)] in + inTags False "p" attribs $ inlinesToTEI opts lst +blockToTEI opts (Div _ bs) = blocksToTEI opts $ map plainToPara bs +blockToTEI _ (Header _ _ _) = empty -- should not occur after hierarchicalize +-- For TEI simple, text must be within containing block element, so +-- we use plainToPara to ensure that Plain text ends up contained by +-- something. +blockToTEI opts (Plain lst) = blockToTEI opts $ Para lst +-- title beginning with fig: indicates that the image is a figure +--blockToTEI opts (Para [Image attr txt (src,'f':'i':'g':':':_)]) = +-- let alt = inlinesToTEI opts txt +-- capt = if null txt +-- then empty +-- else inTagsSimple "title" alt +-- in inTagsIndented "figure" $ +-- capt $$ +-- (inTagsIndented "mediaobject" $ +-- (inTagsIndented "imageobject" +-- (imageToTEI opts attr src)) $$ +-- inTagsSimple "textobject" (inTagsSimple "phrase" alt)) +blockToTEI opts (Para lst) = + inTags False "p" [] $ inlinesToTEI opts lst +blockToTEI opts (BlockQuote blocks) = + inTagsIndented "quote" $ blocksToTEI opts blocks +blockToTEI _ (CodeBlock (_,classes,_) str) = + text ("<ab type='codeblock " ++ lang ++ "'>") <> cr <> + flush (text (escapeStringForXML str) <> cr <> text "</ab>") + where lang = if null langs + then "" + else escapeStringForXML (head langs) + isLang l = map toLower l `elem` map (map toLower) languages + langsFrom s = if isLang s + then [s] + else languagesByExtension . map toLower $ s + langs = concatMap langsFrom classes +blockToTEI opts (BulletList lst) = + let attribs = [("type", "unordered")] + in inTags True "list" attribs $ listItemsToTEI opts lst +blockToTEI _ (OrderedList _ []) = empty +blockToTEI opts (OrderedList (start, numstyle, _) (first:rest)) = + let attribs = case numstyle of + DefaultStyle -> [] + Decimal -> [("type", "ordered:arabic")] + Example -> [("type", "ordered:arabic")] + UpperAlpha -> [("type", "ordered:upperalpha")] + LowerAlpha -> [("type", "ordered:loweralpha")] + UpperRoman -> [("type", "ordered:upperroman")] + LowerRoman -> [("type", "ordered:lowerroman")] + items = if start == 1 + then listItemsToTEI opts (first:rest) + else (inTags True "item" [("n",show start)] + (blocksToTEI opts $ map plainToPara first)) $$ + listItemsToTEI opts rest + in inTags True "list" attribs items +blockToTEI opts (DefinitionList lst) = + let attribs = [("type", "definition")] + in inTags True "list" attribs $ deflistItemsToTEI opts lst +blockToTEI _ (RawBlock f str) + | f == "tei" = text str -- raw TEI block (should such a thing exist). +-- | f == "html" = text str -- allow html for backwards compatibility + | otherwise = empty +blockToTEI _ HorizontalRule = + selfClosingTag "milestone" [("unit","undefined"), ("type","separator"),("rendition","line")] + +-- | TEI Tables +-- TEI Simple's tables are composed of cells and rows; other +-- table info in the AST is here lossily discard. +blockToTEI opts (Table _ _ _ headers rows) = + let + headers' = tableHeadersToTEI opts headers +-- headers' = if all null headers +-- then return empty +-- else tableRowToTEI opts headers + in + inTags True "table" [] $ + vcat $ [headers'] <> map (tableRowToTEI opts) rows + +tableRowToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableRowToTEI opts cols = + inTagsIndented "row" $ vcat $ map (tableItemToTEI opts) cols + +tableHeadersToTEI :: WriterOptions + -> [[Block]] + -> Doc +tableHeadersToTEI opts cols = + inTags True "row" [("role","label")] $ vcat $ map (tableItemToTEI opts) cols + +tableItemToTEI :: WriterOptions + -> [Block] + -> Doc +tableItemToTEI opts item = + inTags False "cell" [] $ vcat $ map (blockToTEI opts) item + +-- | Convert a list of inline elements to TEI. +inlinesToTEI :: WriterOptions -> [Inline] -> Doc +inlinesToTEI opts lst = hcat $ map (inlineToTEI opts) lst + +-- | Convert an inline element to TEI. +inlineToTEI :: WriterOptions -> Inline -> Doc +inlineToTEI _ (Str str) = text $ escapeStringForXML str +inlineToTEI opts (Emph lst) = + inTags False "hi" [("rendition","simple:italic")] $ inlinesToTEI opts lst +inlineToTEI opts (Strong lst) = + inTags False "hi" [("rendition", "simple:bold")] $ inlinesToTEI opts lst +inlineToTEI opts (Strikeout lst) = + inTags False "hi" [("rendition", "simple:strikethrough")] $ + inlinesToTEI opts lst +inlineToTEI opts (Superscript lst) = + inTags False "hi" [("rendition", "simple:superscript")] $ inlinesToTEI opts lst +inlineToTEI opts (Subscript lst) = + inTags False "hi" [("rendition", "simple:subscript")] $ inlinesToTEI opts lst +inlineToTEI opts (SmallCaps lst) = + inTags False "hi" [("rendition", "simple:smallcaps")] $ + inlinesToTEI opts lst +inlineToTEI opts (Quoted _ lst) = + inTagsSimple "quote" $ inlinesToTEI opts lst +inlineToTEI opts (Cite _ lst) = + inlinesToTEI opts lst +inlineToTEI opts (Span _ ils) = + inlinesToTEI opts ils +inlineToTEI _ (Code _ str) = + inTags False "seg" [("type","code")] $ text (escapeStringForXML str) +-- Distinguish display from inline math by wrapping the former in a "figure." +inlineToTEI _ (Math t str) = + case t of + InlineMath -> inTags False "formula" [("notation","TeX")] $ + text (str) + DisplayMath -> inTags True "figure" [("type","math")] $ + inTags False "formula" [("notation","TeX")] $ text (str) + +inlineToTEI _ (RawInline f x) | f == "tei" = text x + | otherwise = empty +inlineToTEI _ LineBreak = selfClosingTag "lb" [] +inlineToTEI _ Space = space +-- because we use \n for LineBreak, we can't do soft breaks: +inlineToTEI _ SoftBreak = space +inlineToTEI opts (Link attr txt (src, _)) + | Just email <- stripPrefix "mailto:" src = + let emailLink = text $ + escapeStringForXML $ email + in case txt of + [Str s] | escapeURI s == email -> emailLink + _ -> inlinesToTEI opts txt <+> + char '(' <> emailLink <> char ')' + | otherwise = + (if isPrefixOf "#" src + then inTags False "ref" $ ("target", drop 1 src) : idAndRole attr + else inTags False "ref" $ ("target", src) : idAndRole attr ) $ + inlinesToTEI opts txt +inlineToTEI opts (Image attr description (src, tit)) = + let titleDoc = if null tit + then empty + else inTags False "figDesc" [] (text $ escapeStringForXML tit) + imageDesc = if null description + then empty + else inTags False "head" [] (inlinesToTEI opts description) + in inTagsIndented "figure" $ imageDesc $$ + imageToTEI opts attr src $$ titleDoc +inlineToTEI opts (Note contents) = + inTagsIndented "note" $ blocksToTEI opts contents + +idAndRole :: Attr -> [(String, String)] +idAndRole (id',cls,_) = ident ++ role + where + ident = if null id' + then [] + else [("id", id')] + role = if null cls + then [] + else [("role", unwords cls)] + diff --git a/tests/Tests/Writers/TEI.hs b/tests/Tests/Writers/TEI.hs new file mode 100644 index 000000000..56764db9f --- /dev/null +++ b/tests/Tests/Writers/TEI.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE OverloadedStrings #-} +module Tests.Writers.TEI (tests) where + +import Test.Framework +import Text.Pandoc.Builder +import Text.Pandoc +import Tests.Helpers +import Tests.Arbitrary() + +{- + "my test" =: X =?> Y + +is shorthand for + + test html "my test" $ X =?> Y + +which is in turn shorthand for + + test html "my test" (X,Y) +-} + +infix 4 =: +(=:) :: (ToString a, ToPandoc a) + => String -> (a, String) -> Test +(=:) = test (writeTEI def . toPandoc) + +tests :: [Test] +tests = [ testGroup "block elements" + ["para" =: para "Lorem ipsum cetera." + =?> "<p>Lorem ipsum cetera.</p>" + ] + , testGroup "inlines" + [ + "Emphasis" =: emph ("emphasized") + =?> "<p><hi rendition=\"simple:italic\">emphasized</hi></p>" + ,"SingleQuoted" =: singleQuoted (text "quoted material") + =?> "<p><quote>quoted material</quote></p>" + ,"DoubleQuoted" =: doubleQuoted (text "quoted material") + =?> "<p><quote>quoted material</quote></p>" + ,"NestedQuoted" =: doubleQuoted (singleQuoted (text "quoted material")) + =?> "<p><quote><quote>quoted material</quote></quote></p>" + ] + ] diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs index f7c2f0c1f..2488917cb 100644 --- a/tests/test-pandoc.hs +++ b/tests/test-pandoc.hs @@ -24,6 +24,7 @@ import qualified Tests.Writers.Plain import qualified Tests.Writers.AsciiDoc import qualified Tests.Writers.Docx import qualified Tests.Writers.RST +import qualified Tests.Writers.TEI import qualified Tests.Shared import qualified Tests.Walk import Text.Pandoc.Shared (inDirectory) @@ -44,6 +45,7 @@ tests = [ testGroup "Old" Tests.Old.tests , testGroup "AsciiDoc" Tests.Writers.AsciiDoc.tests , testGroup "Docx" Tests.Writers.Docx.tests , testGroup "RST" Tests.Writers.RST.tests + , testGroup "TEI" Tests.Writers.TEI.tests ] , testGroup "Readers" [ testGroup "LaTeX" Tests.Readers.LaTeX.tests |