aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2016-01-21 15:23:50 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2016-01-21 15:23:50 -0800
commit3b39b16a4b6d2d8078be27d0a1b3fc977c55731f (patch)
tree0f7fece800c351f4e2aa696224804769e23de26f
parent7ea4d17509c3b0e987793d389987b87638fc9d55 (diff)
parent4d74a966c4dae72f627696e6440f8ac530965d6c (diff)
downloadpandoc-3b39b16a4b6d2d8078be27d0a1b3fc977c55731f.tar.gz
Merge pull request #2638 from c-forster/teiwriter
Add TEI Writer.
-rw-r--r--README15
-rw-r--r--pandoc.cabal2
-rw-r--r--pandoc.hs2
-rw-r--r--src/Text/Pandoc.hs3
-rw-r--r--src/Text/Pandoc/Writers/TEI.hs320
-rw-r--r--tests/Tests/Writers/TEI.hs43
-rw-r--r--tests/test-pandoc.hs2
7 files changed, 380 insertions, 7 deletions
diff --git a/README b/README
index 6d785721b..d0d0ddd6b 100644
--- a/README
+++ b/README
@@ -22,7 +22,7 @@ markup], [Haddock markup], [OPML], [Emacs Org mode], [DocBook],
[OpenDocument], [ODT], [Word docx], [GNU Texinfo], [MediaWiki markup],
[DokuWiki markup], [Haddock markup], [EPUB] (v2 or v3),
[FictionBook2], [Textile], [groff man] pages, [Emacs Org mode],
-[AsciiDoc], [InDesign ICML], and [Slidy], [Slideous], [DZSlides],
+[AsciiDoc], [InDesign ICML], [TEI XML], and [Slidy], [Slideous], [DZSlides],
[reveal.js] or [S5] HTML slide shows. It can also produce [PDF] output
on systems where LaTeX, ConTeXt, or `wkhtmltopdf` is installed.
@@ -89,6 +89,7 @@ Markdown can be expected to be lossy.
[reveal.js]: http://lab.hakim.se/reveal-js/
[FictionBook2]: http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1
[InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf
+[TEI Simple]: https://github.com/TEIC/TEI-Simple
Using `pandoc`
--------------
@@ -277,11 +278,11 @@ General options
`docx` (Word docx), `haddock` (Haddock markup), `rtf` (rich text
format), `epub` (EPUB v2 book), `epub3` (EPUB v3), `fb2`
(FictionBook2 e-book), `asciidoc` (AsciiDoc), `icml` (InDesign
- ICML), `slidy` (Slidy HTML and javascript slide show), `slideous`
- (Slideous HTML and javascript slide show), `dzslides` (DZSlides
- HTML5 + javascript slide show), `revealjs` (reveal.js HTML5 +
- javascript slide show), `s5` (S5 HTML and javascript slide show),
- or the path of a custom lua writer (see [Custom
+ ICML), `tei` (TEI Simple), `slidy` (Slidy HTML and javascript slide
+ show), `slideous` (Slideous HTML and javascript slide show),
+ `dzslides` (DZSlides HTML5 + javascript slide show), `revealjs`
+ (reveal.js HTML5 + javascript slide show), `s5` (S5 HTML and javascript
+ slide show), or the path of a custom lua writer (see [Custom
writers], below). Note that `odt`, `epub`, and
`epub3` output will not be directed to *stdout*; an output
filename must be specified using the `-o/--output` option. If
@@ -471,7 +472,7 @@ General writer options
`-s`, `--standalone`
: Produce output with an appropriate header and footer (e.g. a
- standalone HTML, LaTeX, or RTF file, not a fragment). This option
+ standalone HTML, LaTeX, TEI, or RTF file, not a fragment). This option
is set automatically for `pdf`, `epub`, `epub3`, `fb2`, `docx`, and `odt`
output.
diff --git a/pandoc.cabal b/pandoc.cabal
index 4d2be8e6c..d2fe01eb9 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -362,6 +362,7 @@ Library
Text.Pandoc.Writers.Docx,
Text.Pandoc.Writers.EPUB,
Text.Pandoc.Writers.FB2,
+ Text.Pandoc.Writers.TEI,
Text.Pandoc.PDF,
Text.Pandoc.UTF8,
Text.Pandoc.Templates,
@@ -504,6 +505,7 @@ Test-Suite test-pandoc
Tests.Writers.LaTeX
Tests.Writers.Docx
Tests.Writers.RST
+ Tests.Writers.TEI
Ghc-Options: -rtsopts -Wall -fno-warn-unused-do-bind -threaded
Default-Language: Haskell98
diff --git a/pandoc.hs b/pandoc.hs
index 01f8891d8..72a7592d8 100644
--- a/pandoc.hs
+++ b/pandoc.hs
@@ -1009,6 +1009,8 @@ defaultWriterName x =
".fb2" -> "fb2"
".opml" -> "opml"
".icml" -> "icml"
+ ".tei.xml" -> "tei"
+ ".tei" -> "tei"
['.',y] | y `elem` ['1'..'9'] -> "man"
_ -> "html"
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index 3f46648a2..4b2397eb9 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -115,6 +115,7 @@ module Text.Pandoc
, writeHaddock
, writeCommonMark
, writeCustom
+ , writeTEI
-- * Rendering templates and default templates
, module Text.Pandoc.Templates
-- * Miscellaneous
@@ -169,6 +170,7 @@ import Text.Pandoc.Writers.AsciiDoc
import Text.Pandoc.Writers.Haddock
import Text.Pandoc.Writers.CommonMark
import Text.Pandoc.Writers.Custom
+import Text.Pandoc.Writers.TEI
import Text.Pandoc.Templates
import Text.Pandoc.Options
import Text.Pandoc.Shared (safeRead, warn, mapLeft, pandocVersion)
@@ -304,6 +306,7 @@ writers = [
,("asciidoc" , PureStringWriter writeAsciiDoc)
,("haddock" , PureStringWriter writeHaddock)
,("commonmark" , PureStringWriter writeCommonMark)
+ ,("tei" , PureStringWriter writeTEI)
]
getDefaultExtensions :: String -> Set Extension
diff --git a/src/Text/Pandoc/Writers/TEI.hs b/src/Text/Pandoc/Writers/TEI.hs
new file mode 100644
index 000000000..b9e683ab9
--- /dev/null
+++ b/src/Text/Pandoc/Writers/TEI.hs
@@ -0,0 +1,320 @@
+{-# LANGUAGE OverloadedStrings, PatternGuards #-}
+{-
+Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Writers.Docbook
+ Copyright : Copyright (C) 2006-2015 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of 'Pandoc' documents to Docbook XML.
+-}
+module Text.Pandoc.Writers.TEI (writeTEI) where
+import Text.Pandoc.Definition
+import Text.Pandoc.XML
+import Text.Pandoc.Shared
+import Text.Pandoc.Writers.Shared
+import Text.Pandoc.Options
+import Text.Pandoc.Templates (renderTemplate')
+import Data.List ( stripPrefix, isPrefixOf, isSuffixOf )
+import Data.Char ( toLower )
+import Text.Pandoc.Highlighting ( languages, languagesByExtension )
+import Text.Pandoc.Pretty
+import Text.Pandoc.ImageSize
+import qualified Text.Pandoc.Builder as B
+
+-- | Convert list of authors to a docbook <author> section
+authorToTEI :: WriterOptions -> [Inline] -> B.Inlines
+authorToTEI opts name' =
+ let name = render Nothing $ inlinesToTEI opts name'
+ colwidth = if writerWrapText opts == WrapAuto
+ then Just $ writerColumns opts
+ else Nothing
+ in B.rawInline "tei" $ render colwidth $
+ inTagsSimple "author" (text $ escapeStringForXML name)
+
+-- | Convert Pandoc document to string in Docbook format.
+writeTEI :: WriterOptions -> Pandoc -> String
+writeTEI opts (Pandoc meta blocks) =
+ let elements = hierarchicalize blocks
+ colwidth = if writerWrapText opts == WrapAuto
+ then Just $ writerColumns opts
+ else Nothing
+ render' = render colwidth
+ opts' = if "/book>" `isSuffixOf`
+ (trimr $ writerTemplate opts)
+ then opts{ writerChapters = True }
+ else opts
+ startLvl = if writerChapters opts' then 0 else 1
+ auths' = map (authorToTEI opts) $ docAuthors meta
+ meta' = B.setMeta "author" auths' meta
+ Just metadata = metaToJSON opts
+ (Just . render colwidth . (vcat .
+ (map (elementToTEI opts' startLvl)) . hierarchicalize))
+ (Just . render colwidth . inlinesToTEI opts')
+ meta'
+ main = render' $ vcat (map (elementToTEI opts' startLvl) elements)
+ context = defField "body" main
+ $ defField "mathml" (case writerHTMLMathMethod opts of
+ MathML _ -> True
+ _ -> False)
+ $ metadata
+ in if writerStandalone opts
+ then renderTemplate' (writerTemplate opts) context
+ else main
+
+-- | Convert an Element to TEI.
+elementToTEI :: WriterOptions -> Int -> Element -> Doc
+elementToTEI opts _ (Blk block) = blockToTEI opts block
+elementToTEI opts lvl (Sec _ _num (id',_,_) title elements) =
+ -- TEI doesn't allow sections with no content, so insert some if needed
+ let elements' = if null elements
+ then [Blk (Para [])]
+ else elements
+ divType = case lvl of
+ n | n == 0 -> "chapter"
+ | n >= 1 && n <= 5 -> "level" ++ show n
+ | otherwise -> "section"
+ in inTags True "div" [("type", divType) | not (null id')] $
+-- ("id", writerIdentifierPrefix opts ++ id') | not (null id')] $
+ inTagsSimple "head" (inlinesToTEI opts title) $$
+ vcat (map (elementToTEI opts (lvl + 1)) elements')
+
+-- | Convert a list of Pandoc blocks to TEI.
+blocksToTEI :: WriterOptions -> [Block] -> Doc
+blocksToTEI opts = vcat . map (blockToTEI opts)
+
+-- | Auxiliary function to convert Plain block to Para.
+plainToPara :: Block -> Block
+plainToPara (Plain x) = Para x
+plainToPara x = x
+
+-- | Convert a list of pairs of terms and definitions into a TEI
+-- list with labels and items.
+deflistItemsToTEI :: WriterOptions -> [([Inline],[[Block]])] -> Doc
+deflistItemsToTEI opts items =
+ vcat $ map (\(term, defs) -> deflistItemToTEI opts term defs) items
+
+-- | Convert a term and a list of blocks into a TEI varlistentry.
+deflistItemToTEI :: WriterOptions -> [Inline] -> [[Block]] -> Doc
+deflistItemToTEI opts term defs =
+ let def' = concatMap (map plainToPara) defs
+ in inTagsIndented "label" (inlinesToTEI opts term) $$
+ inTagsIndented "item" (blocksToTEI opts def')
+
+-- | Convert a list of lists of blocks to a list of TEI list items.
+listItemsToTEI :: WriterOptions -> [[Block]] -> Doc
+listItemsToTEI opts items = vcat $ map (listItemToTEI opts) items
+
+-- | Convert a list of blocks into a TEI list item.
+listItemToTEI :: WriterOptions -> [Block] -> Doc
+listItemToTEI opts item =
+ inTagsIndented "item" $ blocksToTEI opts $ map plainToPara item
+
+imageToTEI :: WriterOptions -> Attr -> String -> Doc
+imageToTEI _ attr src = selfClosingTag "graphic" $
+ ("url", src) : idAndRole attr ++ dims
+ where
+ dims = go Width "width" ++ go Height "depth"
+ go dir dstr = case (dimension dir attr) of
+ Just a -> [(dstr, show a)]
+ Nothing -> []
+
+-- | Convert a Pandoc block element to TEI.
+blockToTEI :: WriterOptions -> Block -> Doc
+blockToTEI _ Null = empty
+-- Add ids to paragraphs in divs with ids - this is needed for
+-- pandoc-citeproc to get link anchors in bibliographies:
+blockToTEI opts (Div (ident,_,_) [Para lst]) =
+ let attribs = [("id", ident) | not (null ident)] in
+ inTags False "p" attribs $ inlinesToTEI opts lst
+blockToTEI opts (Div _ bs) = blocksToTEI opts $ map plainToPara bs
+blockToTEI _ (Header _ _ _) = empty -- should not occur after hierarchicalize
+-- For TEI simple, text must be within containing block element, so
+-- we use plainToPara to ensure that Plain text ends up contained by
+-- something.
+blockToTEI opts (Plain lst) = blockToTEI opts $ Para lst
+-- title beginning with fig: indicates that the image is a figure
+--blockToTEI opts (Para [Image attr txt (src,'f':'i':'g':':':_)]) =
+-- let alt = inlinesToTEI opts txt
+-- capt = if null txt
+-- then empty
+-- else inTagsSimple "title" alt
+-- in inTagsIndented "figure" $
+-- capt $$
+-- (inTagsIndented "mediaobject" $
+-- (inTagsIndented "imageobject"
+-- (imageToTEI opts attr src)) $$
+-- inTagsSimple "textobject" (inTagsSimple "phrase" alt))
+blockToTEI opts (Para lst) =
+ inTags False "p" [] $ inlinesToTEI opts lst
+blockToTEI opts (BlockQuote blocks) =
+ inTagsIndented "quote" $ blocksToTEI opts blocks
+blockToTEI _ (CodeBlock (_,classes,_) str) =
+ text ("<ab type='codeblock " ++ lang ++ "'>") <> cr <>
+ flush (text (escapeStringForXML str) <> cr <> text "</ab>")
+ where lang = if null langs
+ then ""
+ else escapeStringForXML (head langs)
+ isLang l = map toLower l `elem` map (map toLower) languages
+ langsFrom s = if isLang s
+ then [s]
+ else languagesByExtension . map toLower $ s
+ langs = concatMap langsFrom classes
+blockToTEI opts (BulletList lst) =
+ let attribs = [("type", "unordered")]
+ in inTags True "list" attribs $ listItemsToTEI opts lst
+blockToTEI _ (OrderedList _ []) = empty
+blockToTEI opts (OrderedList (start, numstyle, _) (first:rest)) =
+ let attribs = case numstyle of
+ DefaultStyle -> []
+ Decimal -> [("type", "ordered:arabic")]
+ Example -> [("type", "ordered:arabic")]
+ UpperAlpha -> [("type", "ordered:upperalpha")]
+ LowerAlpha -> [("type", "ordered:loweralpha")]
+ UpperRoman -> [("type", "ordered:upperroman")]
+ LowerRoman -> [("type", "ordered:lowerroman")]
+ items = if start == 1
+ then listItemsToTEI opts (first:rest)
+ else (inTags True "item" [("n",show start)]
+ (blocksToTEI opts $ map plainToPara first)) $$
+ listItemsToTEI opts rest
+ in inTags True "list" attribs items
+blockToTEI opts (DefinitionList lst) =
+ let attribs = [("type", "definition")]
+ in inTags True "list" attribs $ deflistItemsToTEI opts lst
+blockToTEI _ (RawBlock f str)
+ | f == "tei" = text str -- raw TEI block (should such a thing exist).
+-- | f == "html" = text str -- allow html for backwards compatibility
+ | otherwise = empty
+blockToTEI _ HorizontalRule =
+ selfClosingTag "milestone" [("unit","undefined"), ("type","separator"),("rendition","line")]
+
+-- | TEI Tables
+-- TEI Simple's tables are composed of cells and rows; other
+-- table info in the AST is here lossily discard.
+blockToTEI opts (Table _ _ _ headers rows) =
+ let
+ headers' = tableHeadersToTEI opts headers
+-- headers' = if all null headers
+-- then return empty
+-- else tableRowToTEI opts headers
+ in
+ inTags True "table" [] $
+ vcat $ [headers'] <> map (tableRowToTEI opts) rows
+
+tableRowToTEI :: WriterOptions
+ -> [[Block]]
+ -> Doc
+tableRowToTEI opts cols =
+ inTagsIndented "row" $ vcat $ map (tableItemToTEI opts) cols
+
+tableHeadersToTEI :: WriterOptions
+ -> [[Block]]
+ -> Doc
+tableHeadersToTEI opts cols =
+ inTags True "row" [("role","label")] $ vcat $ map (tableItemToTEI opts) cols
+
+tableItemToTEI :: WriterOptions
+ -> [Block]
+ -> Doc
+tableItemToTEI opts item =
+ inTags False "cell" [] $ vcat $ map (blockToTEI opts) item
+
+-- | Convert a list of inline elements to TEI.
+inlinesToTEI :: WriterOptions -> [Inline] -> Doc
+inlinesToTEI opts lst = hcat $ map (inlineToTEI opts) lst
+
+-- | Convert an inline element to TEI.
+inlineToTEI :: WriterOptions -> Inline -> Doc
+inlineToTEI _ (Str str) = text $ escapeStringForXML str
+inlineToTEI opts (Emph lst) =
+ inTags False "hi" [("rendition","simple:italic")] $ inlinesToTEI opts lst
+inlineToTEI opts (Strong lst) =
+ inTags False "hi" [("rendition", "simple:bold")] $ inlinesToTEI opts lst
+inlineToTEI opts (Strikeout lst) =
+ inTags False "hi" [("rendition", "simple:strikethrough")] $
+ inlinesToTEI opts lst
+inlineToTEI opts (Superscript lst) =
+ inTags False "hi" [("rendition", "simple:superscript")] $ inlinesToTEI opts lst
+inlineToTEI opts (Subscript lst) =
+ inTags False "hi" [("rendition", "simple:subscript")] $ inlinesToTEI opts lst
+inlineToTEI opts (SmallCaps lst) =
+ inTags False "hi" [("rendition", "simple:smallcaps")] $
+ inlinesToTEI opts lst
+inlineToTEI opts (Quoted _ lst) =
+ inTagsSimple "quote" $ inlinesToTEI opts lst
+inlineToTEI opts (Cite _ lst) =
+ inlinesToTEI opts lst
+inlineToTEI opts (Span _ ils) =
+ inlinesToTEI opts ils
+inlineToTEI _ (Code _ str) =
+ inTags False "seg" [("type","code")] $ text (escapeStringForXML str)
+-- Distinguish display from inline math by wrapping the former in a "figure."
+inlineToTEI _ (Math t str) =
+ case t of
+ InlineMath -> inTags False "formula" [("notation","TeX")] $
+ text (str)
+ DisplayMath -> inTags True "figure" [("type","math")] $
+ inTags False "formula" [("notation","TeX")] $ text (str)
+
+inlineToTEI _ (RawInline f x) | f == "tei" = text x
+ | otherwise = empty
+inlineToTEI _ LineBreak = selfClosingTag "lb" []
+inlineToTEI _ Space = space
+-- because we use \n for LineBreak, we can't do soft breaks:
+inlineToTEI _ SoftBreak = space
+inlineToTEI opts (Link attr txt (src, _))
+ | Just email <- stripPrefix "mailto:" src =
+ let emailLink = text $
+ escapeStringForXML $ email
+ in case txt of
+ [Str s] | escapeURI s == email -> emailLink
+ _ -> inlinesToTEI opts txt <+>
+ char '(' <> emailLink <> char ')'
+ | otherwise =
+ (if isPrefixOf "#" src
+ then inTags False "ref" $ ("target", drop 1 src) : idAndRole attr
+ else inTags False "ref" $ ("target", src) : idAndRole attr ) $
+ inlinesToTEI opts txt
+inlineToTEI opts (Image attr description (src, tit)) =
+ let titleDoc = if null tit
+ then empty
+ else inTags False "figDesc" [] (text $ escapeStringForXML tit)
+ imageDesc = if null description
+ then empty
+ else inTags False "head" [] (inlinesToTEI opts description)
+ in inTagsIndented "figure" $ imageDesc $$
+ imageToTEI opts attr src $$ titleDoc
+inlineToTEI opts (Note contents) =
+ inTagsIndented "note" $ blocksToTEI opts contents
+
+idAndRole :: Attr -> [(String, String)]
+idAndRole (id',cls,_) = ident ++ role
+ where
+ ident = if null id'
+ then []
+ else [("id", id')]
+ role = if null cls
+ then []
+ else [("role", unwords cls)]
+
diff --git a/tests/Tests/Writers/TEI.hs b/tests/Tests/Writers/TEI.hs
new file mode 100644
index 000000000..56764db9f
--- /dev/null
+++ b/tests/Tests/Writers/TEI.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Tests.Writers.TEI (tests) where
+
+import Test.Framework
+import Text.Pandoc.Builder
+import Text.Pandoc
+import Tests.Helpers
+import Tests.Arbitrary()
+
+{-
+ "my test" =: X =?> Y
+
+is shorthand for
+
+ test html "my test" $ X =?> Y
+
+which is in turn shorthand for
+
+ test html "my test" (X,Y)
+-}
+
+infix 4 =:
+(=:) :: (ToString a, ToPandoc a)
+ => String -> (a, String) -> Test
+(=:) = test (writeTEI def . toPandoc)
+
+tests :: [Test]
+tests = [ testGroup "block elements"
+ ["para" =: para "Lorem ipsum cetera."
+ =?> "<p>Lorem ipsum cetera.</p>"
+ ]
+ , testGroup "inlines"
+ [
+ "Emphasis" =: emph ("emphasized")
+ =?> "<p><hi rendition=\"simple:italic\">emphasized</hi></p>"
+ ,"SingleQuoted" =: singleQuoted (text "quoted material")
+ =?> "<p><quote>quoted material</quote></p>"
+ ,"DoubleQuoted" =: doubleQuoted (text "quoted material")
+ =?> "<p><quote>quoted material</quote></p>"
+ ,"NestedQuoted" =: doubleQuoted (singleQuoted (text "quoted material"))
+ =?> "<p><quote><quote>quoted material</quote></quote></p>"
+ ]
+ ]
diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs
index f7c2f0c1f..2488917cb 100644
--- a/tests/test-pandoc.hs
+++ b/tests/test-pandoc.hs
@@ -24,6 +24,7 @@ import qualified Tests.Writers.Plain
import qualified Tests.Writers.AsciiDoc
import qualified Tests.Writers.Docx
import qualified Tests.Writers.RST
+import qualified Tests.Writers.TEI
import qualified Tests.Shared
import qualified Tests.Walk
import Text.Pandoc.Shared (inDirectory)
@@ -44,6 +45,7 @@ tests = [ testGroup "Old" Tests.Old.tests
, testGroup "AsciiDoc" Tests.Writers.AsciiDoc.tests
, testGroup "Docx" Tests.Writers.Docx.tests
, testGroup "RST" Tests.Writers.RST.tests
+ , testGroup "TEI" Tests.Writers.TEI.tests
]
, testGroup "Readers"
[ testGroup "LaTeX" Tests.Readers.LaTeX.tests