Support for <indexterm>s when reading DocBook (#7607)

* Support for <indexterm>s when reading DocBook
* Update implementation status of `<n-ary>` tags
* Remove non-idiomatic parentheses
* More complete `<indexterm>` support, with tests

Co-authored-by: Rowan Rodrik van der Molen <rowan@ytec.nl>
author: Rowan Rodrik van der Molen <bigsmoke@gmail.com> 2021-11-05 17:22:38 +0000
committer: GitHub <noreply@github.com> 2021-11-05 10:22:38 -0700
commit: 7a70a46c0319f279fdee3926abff08922be2f02c (patch)
tree: b49a26843e0db55f457812e8a95c3d37ef981671
parent: 5750f60442923780dda91d63dd11b1904446a2d9 (diff)
download: pandoc-7a70a46c0319f279fdee3926abff08922be2f02c.tar.gz
3 files changed, 236 insertions, 4 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index 1c13e597b..bdf802925 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -19,7 +19,7 @@ import Data.Foldable (asum)
 import Data.Generics
 import Data.List (intersperse,elemIndex)
 import Data.List.NonEmpty (nonEmpty)
-import Data.Maybe (fromMaybe,mapMaybe)
+import Data.Maybe (catMaybes,fromMaybe,mapMaybe)
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
@@ -316,7 +316,7 @@ List of all DocBook tags, with [x] indicating implemented,
 [ ] postcode - A postal code in an address
 [x] preface - Introductory matter preceding the first chapter of a book
 [ ] prefaceinfo - Meta-information for a Preface
-[ ] primary - The primary word or phrase under which an index term should be
+[x] primary - The primary word or phrase under which an index term should be
     sorted
 [ ] primaryie - A primary term in an index entry, not in the text
 [ ] printhistory - The printing history of a document
@@ -385,7 +385,7 @@ List of all DocBook tags, with [x] indicating implemented,
 [o] screeninfo - Information about how a screen shot was produced
 [ ] screenshot - A representation of what the user sees or might see on a
     computer screen
-[ ] secondary - A secondary word or phrase in an index term
+[x] secondary - A secondary word or phrase in an index term
 [ ] secondaryie - A secondary term in an index entry, rather than in the text
 [x] sect1 - A top-level section of document
 [x] sect1info - Meta-information for a Sect1
@@ -461,7 +461,7 @@ List of all DocBook tags, with [x] indicating implemented,
 [x] td - A table entry in an HTML table
 [x] term - The word or phrase being defined or described in a variable list
 [ ] termdef - An inline term definition
-[ ] tertiary - A tertiary word or phrase in an index term
+[x] tertiary - A tertiary word or phrase in an index term
 [ ] tertiaryie - A tertiary term in an index entry, rather than in the text
 [ ] textdata - Pointer to external text data
 [ ] textobject - A wrapper for a text description of an object and its
@@ -1080,6 +1080,17 @@ elementToStr :: Content -> Content
 elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
 elementToStr x = x
 
+childElTextAsAttr :: Text -> Element -> Maybe (Text, Text)
+childElTextAsAttr n e = case findChild q e of
+        Nothing -> Nothing
+        Just childEl -> Just (n, strContentRecursive childEl)
+        where q = QName n (Just "http://docbook.org/ns/docbook") Nothing
+
+attrValueAsOptionalAttr :: Text -> Element -> Maybe (Text, Text)
+attrValueAsOptionalAttr n e = case attrValue n e of
+        "" -> Nothing
+        _ -> Just (n, attrValue n e)
+
 parseInline :: PandocMonad m => Content -> DB m Inlines
 parseInline (Text (CData _ s _)) = return $ text s
 parseInline (CRef ref) =
@@ -1094,6 +1105,28 @@ parseInline (Elem e) =
           if ident /= "" || classes /= []
             then innerInlines (spanWith (ident,classes,[]))
             else innerInlines id
+        "indexterm" -> do
+          let ident = attrValue "id" e
+          let classes = T.words $ attrValue "role" e
+          let attrs =
+                -- In DocBook, <primary>, <secondary>, <tertiary>, <see>, and <seealso>
+                -- have mixed content models. However, because we're representing these
+                -- elements in Pandoc's AST as attributes of a phrase, we flatten all
+                -- the descendant content of these elements.
+                [ childElTextAsAttr "primary" e
+                , childElTextAsAttr "secondary" e
+                , childElTextAsAttr "tertiary" e
+                , childElTextAsAttr "see" e
+                , childElTextAsAttr "seealso" e
+                , attrValueAsOptionalAttr "significance" e
+                , attrValueAsOptionalAttr "startref" e
+                , attrValueAsOptionalAttr "scope" e
+                , attrValueAsOptionalAttr "class" e
+                -- We don't do anything with the "pagenum" attribute, because these only
+                -- occur within literal <index> sections, which is not supported by Pandoc,
+                -- because Pandoc has no concept of pages.
+                ]
+          return $ spanWith (ident, ("indexterm" : classes), (catMaybes attrs)) mempty
         "equation" -> equation e displayMath
         "informalequation" -> equation e displayMath
         "inlineequation" -> equation e math
diff --git a/test/docbook-reader.docbook b/test/docbook-reader.docbook
index c38abda82..00bd84649 100644
--- a/test/docbook-reader.docbook
+++ b/test/docbook-reader.docbook
@@ -1603,4 +1603,16 @@ or here: &lt;http://example.com/&gt;
     </step>
   </procedure>
 </sect1>
+<sect1 id="indexterms">
+  <title>Index terms</title>
+  <para>
+    In the simplest case, index terms<indexterm><primary>index term</primary></indexterm> consists of just a <code>&lt;primary&gt;</code> element, but <indexterm><primary>index term</primary><secondary>multi-level</secondary></indexterm> they can also consist of a <code>&lt;primary&gt;</code> <emph>and</emph> <code>&lt;secondary&gt;</code> element, and <indexterm><primary>index term</primary><secondary>multi-level</secondary><tertiary>3-level</tertiary></indexterm> can even include a <code>&lt;tertiary&gt;</code> term.
+  </para>
+  <para>
+    Index terms can also refer to other index terms: <indexterm><primary>index cross referencing</primary></indexterm><indexterm><primary>index term</primary><secondary>cross references</secondary><see>index cross referencing</see></indexterm>exclusively, using the <code>&lt;see&gt;</code> tag; or <indexterm><primary>index cross referencing</primary><seealso>cross referencing</seealso></indexterm> as a reference to related terms, using the <code>&lt;seealso&gt;</code> tag.
+  </para>
+  <para>
+    <indexterm><primary>food</primary><secondary>big <foreignphrase>baguette</foreignphrase> <strong>supreme</strong></secondary></indexterm>Nested content in index term elements is flattened.
+  </para>
+</sect1>
 </article>
diff --git a/test/docbook-reader.native b/test/docbook-reader.native
index be3819336..7520068b1 100644
--- a/test/docbook-reader.native
+++ b/test/docbook-reader.native
@@ -2930,4 +2930,191 @@ Pandoc
             [ Str "A" , Space , Str "Final" , Space , Str "Step" ]
         ]
       ]
+  , Header
+      1
+      ( "indexterms" , [] , [] )
+      [ Str "Index" , Space , Str "terms" ]
+  , Para
+      [ Str "In"
+      , Space
+      , Str "the"
+      , Space
+      , Str "simplest"
+      , Space
+      , Str "case,"
+      , Space
+      , Str "index"
+      , Space
+      , Str "terms"
+      , Span
+          ( "" , [ "indexterm" ] , [ ( "primary" , "index term" ) ] )
+          []
+      , Space
+      , Str "consists"
+      , Space
+      , Str "of"
+      , Space
+      , Str "just"
+      , Space
+      , Str "a"
+      , Space
+      , Code ( "" , [] , [] ) "<primary>"
+      , Space
+      , Str "element,"
+      , Space
+      , Str "but"
+      , Space
+      , Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "index term" )
+            , ( "secondary" , "multi-level" )
+            ]
+          )
+          []
+      , Space
+      , Str "they"
+      , Space
+      , Str "can"
+      , Space
+      , Str "also"
+      , Space
+      , Str "consist"
+      , Space
+      , Str "of"
+      , Space
+      , Str "a"
+      , Space
+      , Code ( "" , [] , [] ) "<primary>"
+      , Space
+      , Str "and"
+      , Space
+      , Code ( "" , [] , [] ) "<secondary>"
+      , Space
+      , Str "element,"
+      , Space
+      , Str "and"
+      , Space
+      , Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "index term" )
+            , ( "secondary" , "multi-level" )
+            , ( "tertiary" , "3-level" )
+            ]
+          )
+          []
+      , Space
+      , Str "can"
+      , Space
+      , Str "even"
+      , Space
+      , Str "include"
+      , Space
+      , Str "a"
+      , Space
+      , Code ( "" , [] , [] ) "<tertiary>"
+      , Space
+      , Str "term."
+      ]
+  , Para
+      [ Str "Index"
+      , Space
+      , Str "terms"
+      , Space
+      , Str "can"
+      , Space
+      , Str "also"
+      , Space
+      , Str "refer"
+      , Space
+      , Str "to"
+      , Space
+      , Str "other"
+      , Space
+      , Str "index"
+      , Space
+      , Str "terms:"
+      , Space
+      , Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "index cross referencing" ) ]
+          )
+          []
+      , Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "index term" )
+            , ( "secondary" , "cross references" )
+            , ( "see" , "index cross referencing" )
+            ]
+          )
+          []
+      , Str "exclusively,"
+      , Space
+      , Str "using"
+      , Space
+      , Str "the"
+      , Space
+      , Code ( "" , [] , [] ) "<see>"
+      , Space
+      , Str "tag;"
+      , Space
+      , Str "or"
+      , Space
+      , Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "index cross referencing" )
+            , ( "seealso" , "cross referencing" )
+            ]
+          )
+          []
+      , Space
+      , Str "as"
+      , Space
+      , Str "a"
+      , Space
+      , Str "reference"
+      , Space
+      , Str "to"
+      , Space
+      , Str "related"
+      , Space
+      , Str "terms,"
+      , Space
+      , Str "using"
+      , Space
+      , Str "the"
+      , Space
+      , Code ( "" , [] , [] ) "<seealso>"
+      , Space
+      , Str "tag."
+      ]
+  , Para
+      [ Span
+          ( ""
+          , [ "indexterm" ]
+          , [ ( "primary" , "food" )
+            , ( "secondary" , "big baguette supreme" )
+            ]
+          )
+          []
+      , Str "Nested"
+      , Space
+      , Str "content"
+      , Space
+      , Str "in"
+      , Space
+      , Str "index"
+      , Space
+      , Str "term"
+      , Space
+      , Str "elements"
+      , Space
+      , Str "is"
+      , Space
+      , Str "flattened."
+      ]
   ]
author	Rowan Rodrik van der Molen <bigsmoke@gmail.com>	2021-11-05 17:22:38 +0000
committer	GitHub <noreply@github.com>	2021-11-05 10:22:38 -0700
commit	7a70a46c0319f279fdee3926abff08922be2f02c (patch)
tree	b49a26843e0db55f457812e8a95c3d37ef981671
parent	5750f60442923780dda91d63dd11b1904446a2d9 (diff)
download	pandoc-7a70a46c0319f279fdee3926abff08922be2f02c.tar.gz