aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/DocBook.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/DocBook.hs')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs189
1 files changed, 103 insertions, 86 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index b0846e345..c49b82ccf 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -1,7 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.DocBook
- Copyright : Copyright (C) 2006-2020 John MacFarlane
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -12,23 +12,28 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
-import Data.Char (isSpace, toUpper)
+import Data.Char (isSpace, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
import Data.Generics
import Data.List (intersperse,elemIndex)
+import Data.List.NonEmpty (nonEmpty)
import Data.Maybe (fromMaybe,mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import Control.Monad.Except (throwError)
import Text.HTML.TagSoup.Entity (lookupEntity)
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import Text.Pandoc.Options
import Text.Pandoc.Logging (LogMessage(..))
-import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
+import Text.Pandoc.Shared (safeRead, extractSpaces)
+import Text.Pandoc.Sources (ToSources(..), sourcesToText)
import Text.TeXMath (readMathML, writeTeX)
-import Text.XML.Light
+import Text.Pandoc.XML.Light
{-
@@ -92,7 +97,7 @@ List of all DocBook tags, with [x] indicating implemented,
[x] chapterinfo - Meta-information for a Chapter
[ ] citation - An inline bibliographic reference to another published work
[ ] citebiblioid - A citation of a bibliographic identifier
-[ ] citerefentry - A citation to a reference page
+[x] citerefentry - A citation to a reference page
[ ] citetitle - The title of a cited work
[ ] city - The name of a city in an address
[x] classname - The name of a class, in the object-oriented programming sense
@@ -129,6 +134,7 @@ List of all DocBook tags, with [x] indicating implemented,
[ ] corpcredit - A corporation or organization credited in a document
[ ] corpname - The name of a corporation
[ ] country - The name of a country
+[x] danger - An admonition set off from the text indicating hazardous situation
[ ] database - The name of a database, or part of a database
[x] date - The date of publication or revision of a document
[ ] dedication - A wrapper for the dedication section of a book
@@ -206,7 +212,7 @@ List of all DocBook tags, with [x] indicating implemented,
[x] info - A wrapper for information about a component or other block. (DocBook v5)
[x] informalequation - A displayed mathematical equation without a title
[x] informalexample - A displayed example without a title
-[ ] informalfigure - A untitled figure
+[x] informalfigure - An untitled figure
[ ] informaltable - A table without a title
[ ] initializer - The initializer for a FieldSynopsis
[x] inlineequation - A mathematical equation or expression occurring inline
@@ -535,24 +541,32 @@ instance Default DBState where
, dbContent = [] }
-readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
+readDocBook :: (PandocMonad m, ToSources a)
+ => ReaderOptions
+ -> a
+ -> m Pandoc
readDocBook _ inp = do
- let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp
+ let sources = toSources inp
+ tree <- either (throwError . PandocXMLError "") return $
+ parseXMLContents
+ (TL.fromStrict . handleInstructions . sourcesToText $ sources)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
--- We treat <?asciidoc-br?> specially (issue #1236), converting it
--- to <br/>, since xml-light doesn't parse the instruction correctly.
--- Other xml instructions are simply removed from the input stream.
+-- We treat certain processing instructions by converting them to tags
+-- beginning "pi-".
handleInstructions :: Text -> Text
-handleInstructions = T.pack . handleInstructions' . T.unpack
-
-handleInstructions' :: String -> String
-handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs
-handleInstructions' xs = case break (=='<') xs of
- (ys, []) -> ys
- ([], '<':zs) -> '<' : handleInstructions' zs
- (ys, zs) -> ys ++ handleInstructions' zs
+handleInstructions t =
+ let (x,y) = T.breakOn "<?" t
+ in if T.null y
+ then x
+ else
+ let (w,z) = T.breakOn "?>" y
+ in (if T.takeWhile (\c -> isLetter c || c == '-')
+ (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"]
+ then x <> "<pi-" <> T.drop 2 w <> "/>"
+ else x <> w <> T.take 2 z) <>
+ handleInstructions (T.drop 2 z)
getFigure :: PandocMonad m => Element -> DB m Blocks
getFigure e = do
@@ -565,32 +579,14 @@ getFigure e = do
modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty }
return res
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
- where go :: [Content] -> [Content]
- go (Text (CData CDataRaw _ _):xs) = xs
- go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
- Text (CData CDataText (s1 ++ s2) z):xs
- go (Text (CData CDataText s1 z):CRef r:xs) =
- Text (CData CDataText (s1 ++ convertEntity r) z):xs
- go (CRef r:Text (CData CDataText s1 z):xs) =
- Text (CData CDataText (convertEntity r ++ s1) z):xs
- go (CRef r1:CRef r2:xs) =
- Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs
- go xs = xs
-
-convertEntity :: String -> String
-convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e)
-
-- convenience function to get an attribute value, defaulting to ""
-attrValue :: String -> Element -> Text
+attrValue :: Text -> Element -> Text
attrValue attr elt =
- maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt))
+ fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt))
-- convenience function
named :: Text -> Element -> Bool
-named s e = qName (elName e) == T.unpack s
+named s e = qName (elName e) == s
--
@@ -605,16 +601,24 @@ addMetadataFromElement e = do
Nothing -> return ()
Just z -> addMetaField "author" z
addMetaField "subtitle" e
- addMetaField "author" e
+ addAuthor e
addMetaField "date" e
addMetaField "release" e
addMetaField "releaseinfo" e
return mempty
- where addMetaField fieldname elt =
- case filterChildren (named fieldname) elt of
- [] -> return ()
- [z] -> getInlines z >>= addMeta fieldname
- zs -> mapM getInlines zs >>= addMeta fieldname
+ where
+ addAuthor elt =
+ case filterChildren (named "author") elt of
+ [] -> return ()
+ [z] -> fromAuthor z >>= addMeta "author"
+ zs -> mapM fromAuthor zs >>= addMeta "author"
+ fromAuthor elt =
+ mconcat . intersperse space <$> mapM getInlines (elChildren elt)
+ addMetaField fieldname elt =
+ case filterChildren (named fieldname) elt of
+ [] -> return ()
+ [z] -> getInlines z >>= addMeta fieldname
+ zs -> mapM getInlines zs >>= addMeta fieldname
addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> DB m ()
addMeta field val = modify (setMeta field val)
@@ -627,7 +631,7 @@ isBlockElement :: Content -> Bool
isBlockElement (Elem e) = qName (elName e) `elem` blockTags
isBlockElement _ = False
-blockTags :: [String]
+blockTags :: [Text]
blockTags =
[ "abstract"
, "ackno"
@@ -669,6 +673,7 @@ blockTags =
, "index"
, "info"
, "informalexample"
+ , "informalfigure"
, "informaltable"
, "itemizedlist"
, "linegroup"
@@ -713,8 +718,8 @@ blockTags =
, "variablelist"
] ++ admonitionTags
-admonitionTags :: [String]
-admonitionTags = ["important","caution","note","tip","warning"]
+admonitionTags :: [Text]
+admonitionTags = ["caution","danger","important","note","tip","warning"]
-- Trim leading and trailing newline characters
trimNl :: Text -> Text
@@ -736,9 +741,9 @@ getMediaobject e = do
figTitle <- gets dbFigureTitle
ident <- gets dbFigureId
(imageUrl, attr) <-
- case filterChild (named "imageobject") e of
- Nothing -> return (mempty, nullAttr)
- Just z -> case filterChild (named "imagedata") z of
+ case filterElements (named "imageobject") e of
+ [] -> return (mempty, nullAttr)
+ (z:_) -> case filterChild (named "imagedata") z of
Nothing -> return (mempty, nullAttr)
Just i -> let atVal a = attrValue a i
w = case atVal "width" of
@@ -771,10 +776,10 @@ getBlocks e = mconcat <$>
parseBlock :: PandocMonad m => Content -> DB m Blocks
parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
-parseBlock (Text (CData _ s _)) = if all isSpace s
+parseBlock (Text (CData _ s _)) = if T.all isSpace s
then return mempty
- else return $ plain $ trimInlines $ text $ T.pack s
-parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x
+ else return $ plain $ trimInlines $ text s
+parseBlock (CRef x) = return $ plain $ str $ T.toUpper x
parseBlock (Elem e) =
case qName (elName e) of
"toc" -> skip -- skip TOC, since in pandoc it's autogenerated
@@ -829,7 +834,7 @@ parseBlock (Elem e) =
"refsect2" -> sect 2
"refsect3" -> sect 3
"refsection" -> gets dbSectionLevel >>= sect . (+1)
- l | l `elem` admonitionTags -> parseAdmonition $ T.pack l
+ l | l `elem` admonitionTags -> parseAdmonition l
"area" -> skip
"areaset" -> skip
"areaspec" -> skip
@@ -855,6 +860,7 @@ parseBlock (Elem e) =
"variablelist" -> definitionList <$> deflistitems
"procedure" -> bulletList <$> steps
"figure" -> getFigure e
+ "informalfigure" -> getFigure e
"mediaobject" -> para <$> getMediaobject e
"caption" -> skip
"info" -> addMetadataFromElement e
@@ -890,7 +896,11 @@ parseBlock (Elem e) =
"subtitle" -> return mempty -- handled in parent element
_ -> skip >> getBlocks e
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
codeBlockWithLang = do
@@ -898,7 +908,7 @@ parseBlock (Elem e) =
"" -> []
x -> [x]
return $ codeBlockWith (attrValue "id" e, classes', [])
- $ trimNl $ T.pack $ strContentRecursive e
+ $ trimNl $ strContentRecursive e
parseBlockquote = do
attrib <- case filterChild (named "attribution") e of
Nothing -> return mempty
@@ -952,17 +962,16 @@ parseBlock (Elem e) =
w <- findAttr (unqual "colwidth") c
n <- safeRead $ "0" <> T.filter (\x ->
(x >= '0' && x <= '9')
- || x == '.') (T.pack w)
+ || x == '.') w
if n > 0 then Just n else Nothing
- let numrows = case bodyrows of
- [] -> 0
- xs -> maximum $ map length xs
+ let numrows = maybe 0 maximum $ nonEmpty
+ $ map length bodyrows
let aligns = case colspecs of
[] -> replicate numrows AlignDefault
cs -> map toAlignment cs
let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9')
|| x == '.') s)
- let textWidth = case filterChild (named "?dbfo") e of
+ let textWidth = case filterChild (named "pi-dbfo") e of
Just d -> case attrValue "table-width" d of
"" -> 1.0
w -> fromMaybe 100.0 (parseWidth w) / 100.0
@@ -1035,12 +1044,12 @@ parseMixed container conts = do
x <- parseMixed container rs
return $ p <> b <> x
-parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell]
+parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell]
parseRow cn = do
let isEntry x = named "entry" x || named "td" x || named "th" x
mapM (parseEntry cn) . filterChildren isEntry
-parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell
+parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell
parseEntry cn el = do
let colDistance sa ea = do
let iStrt = elemIndex sa cn
@@ -1062,7 +1071,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines
getInlines e' = trimInlines . mconcat <$>
mapM parseInline (elContent e')
-strContentRecursive :: Element -> String
+strContentRecursive :: Element -> Text
strContentRecursive = strContent .
(\e' -> e'{ elContent = map elementToStr $ elContent e' })
@@ -1071,16 +1080,16 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
elementToStr x = x
parseInline :: PandocMonad m => Content -> DB m Inlines
-parseInline (Text (CData _ s _)) = return $ text $ T.pack s
+parseInline (Text (CData _ s _)) = return $ text s
parseInline (CRef ref) =
- return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref
+ return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref)
parseInline (Elem e) =
case qName (elName e) of
"anchor" -> do
return $ spanWith (attrValue "id" e, [], []) mempty
"phrase" -> do
let ident = attrValue "id" e
- let classes = T.words $ attrValue "class" e
+ let classes = T.words $ attrValue "role" e
if ident /= "" || classes /= []
then innerInlines (spanWith (ident,classes,[]))
else innerInlines id
@@ -1103,6 +1112,10 @@ parseInline (Elem e) =
"segmentedlist" -> segmentedList
"classname" -> codeWithLang
"code" -> codeWithLang
+ "citerefentry" -> do
+ let title = maybe mempty strContent $ filterChild (named "refentrytitle") e
+ let manvolnum = maybe mempty (\el -> "(" <> strContent el <> ")") $ filterChild (named "manvolnum") e
+ return $ codeWith ("",["citerefentry"],[]) (title <> manvolnum)
"filename" -> codeWithLang
"envar" -> codeWithLang
"literal" -> codeWithLang
@@ -1125,7 +1138,7 @@ parseInline (Elem e) =
"userinput" -> codeWithLang
"systemitem" -> codeWithLang
"varargs" -> return $ code "(...)"
- "keycap" -> return (str $ T.pack $ strContent e)
+ "keycap" -> return (str $ strContent e)
"keycombo" -> keycombo <$>
mapM parseInline (elContent e)
"menuchoice" -> menuchoice <$>
@@ -1137,17 +1150,17 @@ parseInline (Elem e) =
let title = case attrValue "endterm" e of
"" -> maybe "???" xrefTitleByElem
(findElementById linkend content)
- endterm -> maybe "???" (T.pack . strContent)
+ endterm -> maybe "???" strContent
(findElementById endterm content)
return $ link ("#" <> linkend) "" (text title)
- "email" -> return $ link ("mailto:" <> T.pack (strContent e)) ""
- $ str $ T.pack $ strContent e
- "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e
+ "email" -> return $ link ("mailto:" <> strContent e) ""
+ $ str $ strContent e
+ "uri" -> return $ link (strContent e) "" $ str $ strContent e
"ulink" -> innerInlines (link (attrValue "url" e) "")
"link" -> do
ils <- innerInlines id
let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
- Just h -> T.pack h
+ Just h -> h
_ -> "#" <> attrValue "linkend" e
let ils' = if ils == mempty then str href else ils
let attr = (attrValue "id" e, T.words $ attrValue "role" e, [])
@@ -1163,12 +1176,15 @@ parseInline (Elem e) =
"title" -> return mempty
"affiliation" -> skip
-- Note: this isn't a real docbook tag; it's what we convert
- -- <?asciidor-br?> to in handleInstructions, above. A kludge to
- -- work around xml-light's inability to parse an instruction.
- "br" -> return linebreak
+ -- <?asciidor-br?> to in handleInstructions, above.
+ "pi-asciidoc-br" -> return linebreak
_ -> skip >> innerInlines id
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
innerInlines f = extractSpaces f . mconcat <$>
@@ -1177,7 +1193,7 @@ parseInline (Elem e) =
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
- return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e
simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines
(filterChildren (named "member") e)
segmentedList = do
@@ -1218,10 +1234,10 @@ parseInline (Elem e) =
"sect5" -> descendantContent "title" el
"cmdsynopsis" -> descendantContent "command" el
"funcsynopsis" -> descendantContent "function" el
- _ -> T.pack $ qName (elName el) ++ "_title"
+ _ -> qName (elName el) <> "_title"
where
xrefLabel = attrValue "xreflabel" el
- descendantContent name = maybe "???" (T.pack . strContent)
+ descendantContent name = maybe "???" strContent
. filterElementName (\n -> qName n == name)
-- | Extract a math equation from an element
@@ -1241,8 +1257,9 @@ equation e constructor =
where
mathMLEquations :: [Text]
mathMLEquations = map writeTeX $ rights $ readMath
- (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml")
- (readMathML . T.pack . showElement)
+ (\x -> qName (elName x) == "math" &&
+ qURI (elName x) == Just "http://www.w3.org/1998/Math/MathML")
+ (readMathML . showElement)
latexEquations :: [Text]
latexEquations = readMath (\x -> qName (elName x) == "mathphrase")
@@ -1256,8 +1273,8 @@ equation e constructor =
-- | Get the actual text stored in a CData block. 'showContent'
-- returns the text still surrounded by the [[CDATA]] tags.
showVerbatimCData :: Content -> Text
-showVerbatimCData (Text (CData _ d _)) = T.pack d
-showVerbatimCData c = T.pack $ showContent c
+showVerbatimCData (Text (CData _ d _)) = d
+showVerbatimCData c = showContent c
-- | Set the prefix of a name to 'Nothing'